489 files changed, 44853 insertions, 18771 deletions
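The first change below carries a local patch that wraps globals referenced from hand-written asm in a DECLARE_PROTECTED macro. The underlying issue: with default ELF visibility, a global exported from a shared object is preemptible, so position-independent references must go through the GOT, and GOT-offset relocations such as R_386_GOTOFF are rejected by newer binutils. "protected" visibility keeps the symbol exported but binds in-library references locally. A minimal standalone sketch of the attribute, not taken from the patch itself (names are illustrative; assumes GCC or Clang on an ELF target):

```c
/* sketch.c -- hypothetical example; build as PIC to see the effect:
 *   gcc -m32 -fPIC -shared sketch.c -o libsketch.so
 */

/* Default visibility: another DSO may preempt this definition, so PIC
 * code must reach it through the GOT. A GOT-offset (R_386_GOTOFF)
 * reference from asm would be rejected by binutils 2.27 and later. */
const short filter_default[4] = { 1, 2, 3, 4 };

/* Protected visibility: still exported, but references from inside this
 * shared object bind locally, so GOT-offset addressing becomes legal. */
__attribute__((visibility("protected")))
const short filter_protected[4] = { 1, 2, 3, 4 };

short first_tap(void) {
  /* In-DSO reference; with "protected" the toolchain may bind this
   * access directly instead of routing it through the GOT. */
  return filter_protected[0];
}
```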
diff --git a/Add-visibility-protected-attribute-for-global-variab.patch b/Add-visibility-protected-attribute-for-global-variab.patch
new file mode 100644
index 000000000..f15a73caf
--- /dev/null
+++ b/Add-visibility-protected-attribute-for-global-variab.patch
@@ -0,0 +1,77 @@
+From 0d88e15454b632d92404dd6a7181c58d9985e2a2 Mon Sep 17 00:00:00 2001
+From: Rahul Chaudhry <rahulchaudhry@google.com>
+Date: Tue, 9 May 2017 12:00:58 -0700
+Subject: [PATCH] Add visibility="protected" attribute for global variables
+ referenced in asm files.
+
+During aosp builds with binutils-2.27, we're seeing linker error
+messages of this form:
+libvpx.a(subpixel_mmx.o): relocation R_386_GOTOFF against preemptible
+symbol vp8_bilinear_filters_x86_8 cannot be used when making a shared
+object
+
+subpixel_mmx.o is assembled from "vp8/common/x86/subpixel_mmx.asm".
+Other messages refer to symbol references from deblock_sse2.o and
+subpixel_sse2.o, also assembled from asm files.
+
+This change marks such symbols as having "protected" visibility. This
+satisfies the linker as the symbols are not preemptible from outside
+the shared library now, which I think is the original intent anyway.
+
+Change-Id: I2817f7a5f43041533d65ebf41aefd63f8581a452
+---
+ vp8/common/x86/filter_x86.c | 3 ++-
+ vpx_dsp/deblock.c           | 4 ++--
+ vpx_ports/mem.h             | 6 ++++++
+ 3 files changed, 10 insertions(+), 3 deletions(-)
+
+diff --git a/vp8/common/x86/filter_x86.c b/vp8/common/x86/filter_x86.c
+index 2405342f0..73435a7dd 100644
+--- a/vp8/common/x86/filter_x86.c
++++ b/vp8/common/x86/filter_x86.c
+@@ -17,7 +17,8 @@ DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = {
+   { 32, 32, 32, 32, 96, 96, 96, 96 }, { 16, 16, 16, 16, 112, 112, 112, 112 }
+ };
+
+-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = {
++DECLARE_PROTECTED(DECLARE_ALIGNED(16, const short,
++                                  vp8_bilinear_filters_x86_8[8][16])) = {
+   { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
+   { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
+   { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
+diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c
+index a0db1e40c..3734ac251 100644
+--- a/vpx_dsp/deblock.c
++++ b/vpx_dsp/deblock.c
+@@ -10,9 +10,9 @@
+ #include <assert.h>
+ #include <stdlib.h>
+ #include "./vpx_dsp_rtcd.h"
+-#include "vpx/vpx_integer.h"
++#include "vpx_ports/mem.h"
+
+-const int16_t vpx_rv[] = {
++DECLARE_PROTECTED(const int16_t vpx_rv[]) = {
+   8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, 14,
+   4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0,
+   3, 14, 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8,
+diff --git a/vpx_ports/mem.h b/vpx_ports/mem.h
+index bfef783b1..35751cef8 100644
+--- a/vpx_ports/mem.h
++++ b/vpx_ports/mem.h
+@@ -23,6 +23,12 @@
+ #define DECLARE_ALIGNED(n, typ, val) typ val
+ #endif
+
++#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(_WIN32)
++#define DECLARE_PROTECTED(decl) decl __attribute__((visibility("protected")))
++#else
++#define DECLARE_PROTECTED(decl) decl
++#endif
++
+ #if HAVE_NEON && defined(_MSC_VER)
+ #define __builtin_prefetch(x)
+ #endif
+--
+2.15.1
+
diff --git a/Android.bp b/Android.bp
index ad834ebcb..432246f14 100644
--- a/Android.bp
+++ b/Android.bp
@@ -2,7 +2,6 @@
 // Generated from Android.bp.in, run ./generate_config.sh to regenerate
 
 libvpx_arm_neon_c_srcs = [
-    "libvpx/vp8/common/alloccommon.c",
     "libvpx/vp8/common/arm/loopfilter_arm.c",
     "libvpx/vp8/common/arm/neon/bilinearpredict_neon.c",
"libvpx/vp8/common/arm/neon/copymem_neon.c", @@ -19,142 +18,34 @@ libvpx_arm_neon_c_srcs = [ "libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c", "libvpx/vp8/common/arm/neon/sixtappredict_neon.c", "libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c", - "libvpx/vp8/common/blockd.c", - "libvpx/vp8/common/copy_c.c", - "libvpx/vp8/common/dequantize.c", - "libvpx/vp8/common/entropy.c", - "libvpx/vp8/common/entropymode.c", - "libvpx/vp8/common/entropymv.c", - "libvpx/vp8/common/extend.c", - "libvpx/vp8/common/filter.c", - "libvpx/vp8/common/findnearmv.c", - "libvpx/vp8/common/generic/systemdependent.c", - "libvpx/vp8/common/idct_blk.c", - "libvpx/vp8/common/idctllm.c", - "libvpx/vp8/common/loopfilter_filters.c", - "libvpx/vp8/common/mbpitch.c", - "libvpx/vp8/common/modecont.c", - "libvpx/vp8/common/quant_common.c", - "libvpx/vp8/common/reconinter.c", - "libvpx/vp8/common/reconintra.c", - "libvpx/vp8/common/reconintra4x4.c", - "libvpx/vp8/common/rtcd.c", - "libvpx/vp8/common/setupintrarecon.c", - "libvpx/vp8/common/swapyv12buffer.c", - "libvpx/vp8/common/treecoder.c", - "libvpx/vp8/common/vp8_loopfilter.c", - "libvpx/vp8/decoder/dboolhuff.c", - "libvpx/vp8/decoder/decodeframe.c", - "libvpx/vp8/decoder/decodemv.c", - "libvpx/vp8/decoder/detokenize.c", - "libvpx/vp8/decoder/onyxd_if.c", - "libvpx/vp8/decoder/threading.c", "libvpx/vp8/encoder/arm/neon/denoising_neon.c", "libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c", "libvpx/vp8/encoder/arm/neon/shortfdct_neon.c", "libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c", - "libvpx/vp8/encoder/bitstream.c", - "libvpx/vp8/encoder/boolhuff.c", - "libvpx/vp8/encoder/dct.c", - "libvpx/vp8/encoder/denoising.c", - "libvpx/vp8/encoder/encodeframe.c", - "libvpx/vp8/encoder/encodeintra.c", - "libvpx/vp8/encoder/encodemb.c", - "libvpx/vp8/encoder/encodemv.c", - "libvpx/vp8/encoder/ethreading.c", - "libvpx/vp8/encoder/lookahead.c", - "libvpx/vp8/encoder/mcomp.c", - "libvpx/vp8/encoder/modecosts.c", - "libvpx/vp8/encoder/onyx_if.c", - "libvpx/vp8/encoder/pickinter.c", - "libvpx/vp8/encoder/picklpf.c", - "libvpx/vp8/encoder/ratectrl.c", - "libvpx/vp8/encoder/rdopt.c", - "libvpx/vp8/encoder/segmentation.c", - "libvpx/vp8/encoder/tokenize.c", - "libvpx/vp8/encoder/treewriter.c", - "libvpx/vp8/encoder/vp8_quantize.c", - "libvpx/vp8/vp8_cx_iface.c", - "libvpx/vp8/vp8_dx_iface.c", - "libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c", - "libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c", - "libvpx/vp9/common/vp9_alloccommon.c", - "libvpx/vp9/common/vp9_blockd.c", - "libvpx/vp9/common/vp9_common_data.c", - "libvpx/vp9/common/vp9_entropy.c", - "libvpx/vp9/common/vp9_entropymode.c", - "libvpx/vp9/common/vp9_entropymv.c", - "libvpx/vp9/common/vp9_filter.c", - "libvpx/vp9/common/vp9_frame_buffers.c", - "libvpx/vp9/common/vp9_idct.c", - "libvpx/vp9/common/vp9_loopfilter.c", - "libvpx/vp9/common/vp9_mvref_common.c", - "libvpx/vp9/common/vp9_pred_common.c", - "libvpx/vp9/common/vp9_quant_common.c", - "libvpx/vp9/common/vp9_reconinter.c", - "libvpx/vp9/common/vp9_reconintra.c", - "libvpx/vp9/common/vp9_rtcd.c", - "libvpx/vp9/common/vp9_scale.c", - "libvpx/vp9/common/vp9_scan.c", - "libvpx/vp9/common/vp9_seg_common.c", - "libvpx/vp9/common/vp9_thread_common.c", - "libvpx/vp9/common/vp9_tile_common.c", - "libvpx/vp9/decoder/vp9_decodeframe.c", - "libvpx/vp9/decoder/vp9_decodemv.c", - "libvpx/vp9/decoder/vp9_decoder.c", - "libvpx/vp9/decoder/vp9_detokenize.c", - "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c", 
- "libvpx/vp9/encoder/arm/neon/vp9_error_neon.c", + "libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c", "libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c", - "libvpx/vp9/encoder/vp9_alt_ref_aq.c", - "libvpx/vp9/encoder/vp9_aq_360.c", - "libvpx/vp9/encoder/vp9_aq_complexity.c", - "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c", - "libvpx/vp9/encoder/vp9_aq_variance.c", - "libvpx/vp9/encoder/vp9_bitstream.c", - "libvpx/vp9/encoder/vp9_context_tree.c", - "libvpx/vp9/encoder/vp9_cost.c", - "libvpx/vp9/encoder/vp9_dct.c", - "libvpx/vp9/encoder/vp9_encodeframe.c", - "libvpx/vp9/encoder/vp9_encodemb.c", - "libvpx/vp9/encoder/vp9_encodemv.c", - "libvpx/vp9/encoder/vp9_encoder.c", - "libvpx/vp9/encoder/vp9_ethread.c", - "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", - "libvpx/vp9/encoder/vp9_frame_scale.c", - "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", - "libvpx/vp9/encoder/vp9_mcomp.c", - "libvpx/vp9/encoder/vp9_multi_thread.c", - "libvpx/vp9/encoder/vp9_noise_estimate.c", - "libvpx/vp9/encoder/vp9_picklpf.c", - "libvpx/vp9/encoder/vp9_pickmode.c", - "libvpx/vp9/encoder/vp9_quantize.c", - "libvpx/vp9/encoder/vp9_ratectrl.c", - "libvpx/vp9/encoder/vp9_rd.c", - "libvpx/vp9/encoder/vp9_rdopt.c", - "libvpx/vp9/encoder/vp9_resize.c", - "libvpx/vp9/encoder/vp9_segmentation.c", - "libvpx/vp9/encoder/vp9_skin_detection.c", - "libvpx/vp9/encoder/vp9_speed_features.c", - "libvpx/vp9/encoder/vp9_subexp.c", - "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", - "libvpx/vp9/encoder/vp9_tokenize.c", - "libvpx/vp9/encoder/vp9_treewriter.c", - "libvpx/vp9/vp9_cx_iface.c", - "libvpx/vp9/vp9_dx_iface.c", - "libvpx/vpx/src/vpx_codec.c", - "libvpx/vpx/src/vpx_decoder.c", - "libvpx/vpx/src/vpx_encoder.c", - "libvpx/vpx/src/vpx_image.c", "libvpx/vpx_dsp/arm/avg_neon.c", + "libvpx/vpx_dsp/arm/avg_pred_neon.c", + "libvpx/vpx_dsp/arm/fdct16x16_neon.c", + "libvpx/vpx_dsp/arm/fdct32x32_neon.c", "libvpx/vpx_dsp/arm/fdct_neon.c", + "libvpx/vpx_dsp/arm/fdct_partial_neon.c", "libvpx/vpx_dsp/arm/fwd_txfm_neon.c", "libvpx/vpx_dsp/arm/hadamard_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_intrapred_neon.c", + "libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c", "libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c", "libvpx/vpx_dsp/arm/idct16x16_add_neon.c", "libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c", @@ -164,41 +55,21 @@ libvpx_arm_neon_c_srcs = [ "libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c", "libvpx/vpx_dsp/arm/idct8x8_add_neon.c", "libvpx/vpx_dsp/arm/intrapred_neon.c", + "libvpx/vpx_dsp/arm/quantize_neon.c", "libvpx/vpx_dsp/arm/sad4d_neon.c", "libvpx/vpx_dsp/arm/sad_neon.c", "libvpx/vpx_dsp/arm/subpel_variance_neon.c", "libvpx/vpx_dsp/arm/subtract_neon.c", "libvpx/vpx_dsp/arm/variance_neon.c", "libvpx/vpx_dsp/arm/vpx_convolve_neon.c", - "libvpx/vpx_dsp/avg.c", - "libvpx/vpx_dsp/bitreader.c", - "libvpx/vpx_dsp/bitreader_buffer.c", - "libvpx/vpx_dsp/bitwriter.c", - 
"libvpx/vpx_dsp/bitwriter_buffer.c", - "libvpx/vpx_dsp/fwd_txfm.c", - "libvpx/vpx_dsp/intrapred.c", - "libvpx/vpx_dsp/inv_txfm.c", - "libvpx/vpx_dsp/loopfilter.c", - "libvpx/vpx_dsp/prob.c", - "libvpx/vpx_dsp/psnr.c", - "libvpx/vpx_dsp/quantize.c", - "libvpx/vpx_dsp/sad.c", - "libvpx/vpx_dsp/subtract.c", - "libvpx/vpx_dsp/sum_squares.c", - "libvpx/vpx_dsp/variance.c", - "libvpx/vpx_dsp/vpx_convolve.c", - "libvpx/vpx_dsp/vpx_dsp_rtcd.c", - "libvpx/vpx_mem/vpx_mem.c", - "libvpx/vpx_ports/arm_cpudetect.c", - "libvpx/vpx_scale/generic/gen_scalers.c", - "libvpx/vpx_scale/generic/vpx_scale.c", - "libvpx/vpx_scale/generic/yv12config.c", - "libvpx/vpx_scale/generic/yv12extend.c", - "libvpx/vpx_scale/vpx_scale_rtcd.c", - "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c", "config/arm-neon/vpx_config.c", ] +libvpx_arm_neon_exclude_c_srcs = [ + "config/arm/vpx_config.c", +] + libvpx_arm_neon_asm_srcs = [ "libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm", "libvpx/vpx_dsp/arm/idct4x4_add_neon.asm", @@ -240,6 +111,7 @@ libvpx_arm_c_srcs = [ "libvpx/vp8/common/swapyv12buffer.c", "libvpx/vp8/common/treecoder.c", "libvpx/vp8/common/vp8_loopfilter.c", + "libvpx/vp8/common/vp8_skin_detection.c", "libvpx/vp8/decoder/dboolhuff.c", "libvpx/vp8/decoder/decodeframe.c", "libvpx/vp8/decoder/decodemv.c", @@ -295,7 +167,6 @@ libvpx_arm_c_srcs = [ "libvpx/vp9/decoder/vp9_decoder.c", "libvpx/vp9/decoder/vp9_detokenize.c", "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/vp9_alt_ref_aq.c", "libvpx/vp9/encoder/vp9_aq_360.c", "libvpx/vp9/encoder/vp9_aq_complexity.c", @@ -311,10 +182,8 @@ libvpx_arm_c_srcs = [ "libvpx/vp9/encoder/vp9_encoder.c", "libvpx/vp9/encoder/vp9_ethread.c", "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", "libvpx/vp9/encoder/vp9_frame_scale.c", "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", "libvpx/vp9/encoder/vp9_mcomp.c", "libvpx/vp9/encoder/vp9_multi_thread.c", "libvpx/vp9/encoder/vp9_noise_estimate.c", @@ -330,7 +199,6 @@ libvpx_arm_c_srcs = [ "libvpx/vp9/encoder/vp9_speed_features.c", "libvpx/vp9/encoder/vp9_subexp.c", "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", "libvpx/vp9/encoder/vp9_tokenize.c", "libvpx/vp9/encoder/vp9_treewriter.c", "libvpx/vp9/vp9_cx_iface.c", @@ -352,6 +220,7 @@ libvpx_arm_c_srcs = [ "libvpx/vpx_dsp/psnr.c", "libvpx/vpx_dsp/quantize.c", "libvpx/vpx_dsp/sad.c", + "libvpx/vpx_dsp/skin_detection.c", "libvpx/vpx_dsp/subtract.c", "libvpx/vpx_dsp/sum_squares.c", "libvpx/vpx_dsp/variance.c", @@ -365,6 +234,7 @@ libvpx_arm_c_srcs = [ "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/vpx_scale_rtcd.c", "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_util/vpx_write_yuv_frame.c", "config/arm/vpx_config.c", ] @@ -410,6 +280,7 @@ libvpx_arm64_c_srcs = [ "libvpx/vp8/common/swapyv12buffer.c", "libvpx/vp8/common/treecoder.c", "libvpx/vp8/common/vp8_loopfilter.c", + "libvpx/vp8/common/vp8_skin_detection.c", "libvpx/vp8/decoder/dboolhuff.c", "libvpx/vp8/decoder/decodeframe.c", "libvpx/vp8/decoder/decodemv.c", @@ -443,8 +314,6 @@ libvpx_arm64_c_srcs = [ "libvpx/vp8/encoder/vp8_quantize.c", "libvpx/vp8/vp8_cx_iface.c", "libvpx/vp8/vp8_dx_iface.c", - "libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c", - "libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c", "libvpx/vp9/common/vp9_alloccommon.c", "libvpx/vp9/common/vp9_blockd.c", "libvpx/vp9/common/vp9_common_data.c", @@ -471,9 +340,8 @@ libvpx_arm64_c_srcs = [ 
"libvpx/vp9/decoder/vp9_decoder.c", "libvpx/vp9/decoder/vp9_detokenize.c", "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c", - "libvpx/vp9/encoder/arm/neon/vp9_error_neon.c", + "libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c", "libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c", "libvpx/vp9/encoder/vp9_alt_ref_aq.c", "libvpx/vp9/encoder/vp9_aq_360.c", @@ -490,10 +358,8 @@ libvpx_arm64_c_srcs = [ "libvpx/vp9/encoder/vp9_encoder.c", "libvpx/vp9/encoder/vp9_ethread.c", "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", "libvpx/vp9/encoder/vp9_frame_scale.c", "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", "libvpx/vp9/encoder/vp9_mcomp.c", "libvpx/vp9/encoder/vp9_multi_thread.c", "libvpx/vp9/encoder/vp9_noise_estimate.c", @@ -509,7 +375,6 @@ libvpx_arm64_c_srcs = [ "libvpx/vp9/encoder/vp9_speed_features.c", "libvpx/vp9/encoder/vp9_subexp.c", "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", "libvpx/vp9/encoder/vp9_tokenize.c", "libvpx/vp9/encoder/vp9_treewriter.c", "libvpx/vp9/vp9_cx_iface.c", @@ -519,9 +384,26 @@ libvpx_arm64_c_srcs = [ "libvpx/vpx/src/vpx_encoder.c", "libvpx/vpx/src/vpx_image.c", "libvpx/vpx_dsp/arm/avg_neon.c", + "libvpx/vpx_dsp/arm/avg_pred_neon.c", + "libvpx/vpx_dsp/arm/fdct16x16_neon.c", + "libvpx/vpx_dsp/arm/fdct32x32_neon.c", "libvpx/vpx_dsp/arm/fdct_neon.c", + "libvpx/vpx_dsp/arm/fdct_partial_neon.c", "libvpx/vpx_dsp/arm/fwd_txfm_neon.c", "libvpx/vpx_dsp/arm/hadamard_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_intrapred_neon.c", + "libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c", "libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c", "libvpx/vpx_dsp/arm/idct16x16_add_neon.c", "libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c", @@ -534,6 +416,7 @@ libvpx_arm64_c_srcs = [ "libvpx/vpx_dsp/arm/idct8x8_add_neon.c", "libvpx/vpx_dsp/arm/intrapred_neon.c", "libvpx/vpx_dsp/arm/loopfilter_neon.c", + "libvpx/vpx_dsp/arm/quantize_neon.c", "libvpx/vpx_dsp/arm/sad4d_neon.c", "libvpx/vpx_dsp/arm/sad_neon.c", "libvpx/vpx_dsp/arm/subpel_variance_neon.c", @@ -543,6 +426,7 @@ libvpx_arm64_c_srcs = [ "libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c", "libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c", "libvpx/vpx_dsp/arm/vpx_convolve_neon.c", + "libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c", "libvpx/vpx_dsp/avg.c", "libvpx/vpx_dsp/bitreader.c", "libvpx/vpx_dsp/bitreader_buffer.c", @@ -556,6 +440,7 @@ libvpx_arm64_c_srcs = [ "libvpx/vpx_dsp/psnr.c", "libvpx/vpx_dsp/quantize.c", "libvpx/vpx_dsp/sad.c", + "libvpx/vpx_dsp/skin_detection.c", "libvpx/vpx_dsp/subtract.c", "libvpx/vpx_dsp/sum_squares.c", "libvpx/vpx_dsp/variance.c", @@ -569,6 +454,7 @@ libvpx_arm64_c_srcs = [ "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/vpx_scale_rtcd.c", "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_util/vpx_write_yuv_frame.c", "config/arm64/vpx_config.c", ] @@ -598,6 +484,7 @@ 
libvpx_generic_c_srcs = [ "libvpx/vp8/common/swapyv12buffer.c", "libvpx/vp8/common/treecoder.c", "libvpx/vp8/common/vp8_loopfilter.c", + "libvpx/vp8/common/vp8_skin_detection.c", "libvpx/vp8/decoder/dboolhuff.c", "libvpx/vp8/decoder/decodeframe.c", "libvpx/vp8/decoder/decodemv.c", @@ -653,7 +540,6 @@ libvpx_generic_c_srcs = [ "libvpx/vp9/decoder/vp9_decoder.c", "libvpx/vp9/decoder/vp9_detokenize.c", "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/vp9_alt_ref_aq.c", "libvpx/vp9/encoder/vp9_aq_360.c", "libvpx/vp9/encoder/vp9_aq_complexity.c", @@ -669,10 +555,8 @@ libvpx_generic_c_srcs = [ "libvpx/vp9/encoder/vp9_encoder.c", "libvpx/vp9/encoder/vp9_ethread.c", "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", "libvpx/vp9/encoder/vp9_frame_scale.c", "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", "libvpx/vp9/encoder/vp9_mcomp.c", "libvpx/vp9/encoder/vp9_multi_thread.c", "libvpx/vp9/encoder/vp9_noise_estimate.c", @@ -688,7 +572,6 @@ libvpx_generic_c_srcs = [ "libvpx/vp9/encoder/vp9_speed_features.c", "libvpx/vp9/encoder/vp9_subexp.c", "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", "libvpx/vp9/encoder/vp9_tokenize.c", "libvpx/vp9/encoder/vp9_treewriter.c", "libvpx/vp9/vp9_cx_iface.c", @@ -710,6 +593,7 @@ libvpx_generic_c_srcs = [ "libvpx/vpx_dsp/psnr.c", "libvpx/vpx_dsp/quantize.c", "libvpx/vpx_dsp/sad.c", + "libvpx/vpx_dsp/skin_detection.c", "libvpx/vpx_dsp/subtract.c", "libvpx/vpx_dsp/sum_squares.c", "libvpx/vpx_dsp/variance.c", @@ -722,152 +606,17 @@ libvpx_generic_c_srcs = [ "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/vpx_scale_rtcd.c", "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_util/vpx_write_yuv_frame.c", "config/generic/vpx_config.c", ] libvpx_mips32_dspr2_c_srcs = [ - "libvpx/vp8/common/alloccommon.c", - "libvpx/vp8/common/blockd.c", - "libvpx/vp8/common/copy_c.c", - "libvpx/vp8/common/dequantize.c", - "libvpx/vp8/common/entropy.c", - "libvpx/vp8/common/entropymode.c", - "libvpx/vp8/common/entropymv.c", - "libvpx/vp8/common/extend.c", - "libvpx/vp8/common/filter.c", - "libvpx/vp8/common/findnearmv.c", - "libvpx/vp8/common/generic/systemdependent.c", - "libvpx/vp8/common/idct_blk.c", - "libvpx/vp8/common/idctllm.c", - "libvpx/vp8/common/loopfilter_filters.c", - "libvpx/vp8/common/mbpitch.c", "libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c", "libvpx/vp8/common/mips/dspr2/filter_dspr2.c", "libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c", "libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c", "libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c", "libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c", - "libvpx/vp8/common/modecont.c", - "libvpx/vp8/common/quant_common.c", - "libvpx/vp8/common/reconinter.c", - "libvpx/vp8/common/reconintra.c", - "libvpx/vp8/common/reconintra4x4.c", - "libvpx/vp8/common/rtcd.c", - "libvpx/vp8/common/setupintrarecon.c", - "libvpx/vp8/common/swapyv12buffer.c", - "libvpx/vp8/common/treecoder.c", - "libvpx/vp8/common/vp8_loopfilter.c", - "libvpx/vp8/decoder/dboolhuff.c", - "libvpx/vp8/decoder/decodeframe.c", - "libvpx/vp8/decoder/decodemv.c", - "libvpx/vp8/decoder/detokenize.c", - "libvpx/vp8/decoder/onyxd_if.c", - "libvpx/vp8/decoder/threading.c", - "libvpx/vp8/encoder/bitstream.c", - "libvpx/vp8/encoder/boolhuff.c", - "libvpx/vp8/encoder/dct.c", - "libvpx/vp8/encoder/denoising.c", - "libvpx/vp8/encoder/encodeframe.c", - "libvpx/vp8/encoder/encodeintra.c", - "libvpx/vp8/encoder/encodemb.c", - 
"libvpx/vp8/encoder/encodemv.c", - "libvpx/vp8/encoder/ethreading.c", - "libvpx/vp8/encoder/lookahead.c", - "libvpx/vp8/encoder/mcomp.c", - "libvpx/vp8/encoder/modecosts.c", - "libvpx/vp8/encoder/onyx_if.c", - "libvpx/vp8/encoder/pickinter.c", - "libvpx/vp8/encoder/picklpf.c", - "libvpx/vp8/encoder/ratectrl.c", - "libvpx/vp8/encoder/rdopt.c", - "libvpx/vp8/encoder/segmentation.c", - "libvpx/vp8/encoder/tokenize.c", - "libvpx/vp8/encoder/treewriter.c", - "libvpx/vp8/encoder/vp8_quantize.c", - "libvpx/vp8/vp8_cx_iface.c", - "libvpx/vp8/vp8_dx_iface.c", - "libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c", - "libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c", - "libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c", - "libvpx/vp9/common/vp9_alloccommon.c", - "libvpx/vp9/common/vp9_blockd.c", - "libvpx/vp9/common/vp9_common_data.c", - "libvpx/vp9/common/vp9_entropy.c", - "libvpx/vp9/common/vp9_entropymode.c", - "libvpx/vp9/common/vp9_entropymv.c", - "libvpx/vp9/common/vp9_filter.c", - "libvpx/vp9/common/vp9_frame_buffers.c", - "libvpx/vp9/common/vp9_idct.c", - "libvpx/vp9/common/vp9_loopfilter.c", - "libvpx/vp9/common/vp9_mvref_common.c", - "libvpx/vp9/common/vp9_pred_common.c", - "libvpx/vp9/common/vp9_quant_common.c", - "libvpx/vp9/common/vp9_reconinter.c", - "libvpx/vp9/common/vp9_reconintra.c", - "libvpx/vp9/common/vp9_rtcd.c", - "libvpx/vp9/common/vp9_scale.c", - "libvpx/vp9/common/vp9_scan.c", - "libvpx/vp9/common/vp9_seg_common.c", - "libvpx/vp9/common/vp9_thread_common.c", - "libvpx/vp9/common/vp9_tile_common.c", - "libvpx/vp9/decoder/vp9_decodeframe.c", - "libvpx/vp9/decoder/vp9_decodemv.c", - "libvpx/vp9/decoder/vp9_decoder.c", - "libvpx/vp9/decoder/vp9_detokenize.c", - "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", - "libvpx/vp9/encoder/vp9_alt_ref_aq.c", - "libvpx/vp9/encoder/vp9_aq_360.c", - "libvpx/vp9/encoder/vp9_aq_complexity.c", - "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c", - "libvpx/vp9/encoder/vp9_aq_variance.c", - "libvpx/vp9/encoder/vp9_bitstream.c", - "libvpx/vp9/encoder/vp9_context_tree.c", - "libvpx/vp9/encoder/vp9_cost.c", - "libvpx/vp9/encoder/vp9_dct.c", - "libvpx/vp9/encoder/vp9_encodeframe.c", - "libvpx/vp9/encoder/vp9_encodemb.c", - "libvpx/vp9/encoder/vp9_encodemv.c", - "libvpx/vp9/encoder/vp9_encoder.c", - "libvpx/vp9/encoder/vp9_ethread.c", - "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", - "libvpx/vp9/encoder/vp9_frame_scale.c", - "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", - "libvpx/vp9/encoder/vp9_mcomp.c", - "libvpx/vp9/encoder/vp9_multi_thread.c", - "libvpx/vp9/encoder/vp9_noise_estimate.c", - "libvpx/vp9/encoder/vp9_picklpf.c", - "libvpx/vp9/encoder/vp9_pickmode.c", - "libvpx/vp9/encoder/vp9_quantize.c", - "libvpx/vp9/encoder/vp9_ratectrl.c", - "libvpx/vp9/encoder/vp9_rd.c", - "libvpx/vp9/encoder/vp9_rdopt.c", - "libvpx/vp9/encoder/vp9_resize.c", - "libvpx/vp9/encoder/vp9_segmentation.c", - "libvpx/vp9/encoder/vp9_skin_detection.c", - "libvpx/vp9/encoder/vp9_speed_features.c", - "libvpx/vp9/encoder/vp9_subexp.c", - "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", - "libvpx/vp9/encoder/vp9_tokenize.c", - "libvpx/vp9/encoder/vp9_treewriter.c", - "libvpx/vp9/vp9_cx_iface.c", - "libvpx/vp9/vp9_dx_iface.c", - "libvpx/vpx/src/vpx_codec.c", - "libvpx/vpx/src/vpx_decoder.c", - "libvpx/vpx/src/vpx_encoder.c", - "libvpx/vpx/src/vpx_image.c", - "libvpx/vpx_dsp/avg.c", - "libvpx/vpx_dsp/bitreader.c", - "libvpx/vpx_dsp/bitreader_buffer.c", - 
"libvpx/vpx_dsp/bitwriter.c", - "libvpx/vpx_dsp/bitwriter_buffer.c", - "libvpx/vpx_dsp/fwd_txfm.c", - "libvpx/vpx_dsp/intrapred.c", - "libvpx/vpx_dsp/inv_txfm.c", - "libvpx/vpx_dsp/loopfilter.c", "libvpx/vpx_dsp/mips/common_dspr2.c", "libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c", "libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c", @@ -882,192 +631,38 @@ libvpx_mips32_dspr2_c_srcs = [ "libvpx/vpx_dsp/mips/intrapred16_dspr2.c", "libvpx/vpx_dsp/mips/intrapred4_dspr2.c", "libvpx/vpx_dsp/mips/intrapred8_dspr2.c", - "libvpx/vpx_dsp/mips/itrans16_dspr2.c", - "libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c", - "libvpx/vpx_dsp/mips/itrans32_dspr2.c", - "libvpx/vpx_dsp/mips/itrans4_dspr2.c", - "libvpx/vpx_dsp/mips/itrans8_dspr2.c", "libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c", "libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c", "libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c", "libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c", - "libvpx/vpx_dsp/prob.c", - "libvpx/vpx_dsp/psnr.c", - "libvpx/vpx_dsp/quantize.c", - "libvpx/vpx_dsp/sad.c", - "libvpx/vpx_dsp/subtract.c", - "libvpx/vpx_dsp/sum_squares.c", - "libvpx/vpx_dsp/variance.c", - "libvpx/vpx_dsp/vpx_convolve.c", - "libvpx/vpx_dsp/vpx_dsp_rtcd.c", - "libvpx/vpx_mem/vpx_mem.c", - "libvpx/vpx_scale/generic/gen_scalers.c", - "libvpx/vpx_scale/generic/vpx_scale.c", - "libvpx/vpx_scale/generic/yv12config.c", - "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c", - "libvpx/vpx_scale/vpx_scale_rtcd.c", - "libvpx/vpx_util/vpx_thread.c", "config/mips32-dspr2/vpx_config.c", ] +libvpx_mips32_dspr2_exclude_c_srcs = [ + "config/mips32/vpx_config.c", +] + libvpx_mips32_msa_c_srcs = [ - "libvpx/vp8/common/alloccommon.c", - "libvpx/vp8/common/blockd.c", - "libvpx/vp8/common/copy_c.c", - "libvpx/vp8/common/dequantize.c", - "libvpx/vp8/common/entropy.c", - "libvpx/vp8/common/entropymode.c", - "libvpx/vp8/common/entropymv.c", - "libvpx/vp8/common/extend.c", - "libvpx/vp8/common/filter.c", - "libvpx/vp8/common/findnearmv.c", - "libvpx/vp8/common/generic/systemdependent.c", - "libvpx/vp8/common/idct_blk.c", - "libvpx/vp8/common/idctllm.c", - "libvpx/vp8/common/loopfilter_filters.c", - "libvpx/vp8/common/mbpitch.c", "libvpx/vp8/common/mips/msa/bilinear_filter_msa.c", "libvpx/vp8/common/mips/msa/copymem_msa.c", "libvpx/vp8/common/mips/msa/idct_msa.c", "libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c", "libvpx/vp8/common/mips/msa/sixtap_filter_msa.c", - "libvpx/vp8/common/modecont.c", - "libvpx/vp8/common/quant_common.c", - "libvpx/vp8/common/reconinter.c", - "libvpx/vp8/common/reconintra.c", - "libvpx/vp8/common/reconintra4x4.c", - "libvpx/vp8/common/rtcd.c", - "libvpx/vp8/common/setupintrarecon.c", - "libvpx/vp8/common/swapyv12buffer.c", - "libvpx/vp8/common/treecoder.c", - "libvpx/vp8/common/vp8_loopfilter.c", - "libvpx/vp8/decoder/dboolhuff.c", - "libvpx/vp8/decoder/decodeframe.c", - "libvpx/vp8/decoder/decodemv.c", - "libvpx/vp8/decoder/detokenize.c", - "libvpx/vp8/decoder/onyxd_if.c", - "libvpx/vp8/decoder/threading.c", - "libvpx/vp8/encoder/bitstream.c", - "libvpx/vp8/encoder/boolhuff.c", - "libvpx/vp8/encoder/dct.c", - "libvpx/vp8/encoder/denoising.c", - "libvpx/vp8/encoder/encodeframe.c", - "libvpx/vp8/encoder/encodeintra.c", - "libvpx/vp8/encoder/encodemb.c", - "libvpx/vp8/encoder/encodemv.c", - "libvpx/vp8/encoder/ethreading.c", - "libvpx/vp8/encoder/lookahead.c", - "libvpx/vp8/encoder/mcomp.c", "libvpx/vp8/encoder/mips/msa/dct_msa.c", "libvpx/vp8/encoder/mips/msa/denoising_msa.c", "libvpx/vp8/encoder/mips/msa/encodeopt_msa.c", 
"libvpx/vp8/encoder/mips/msa/quantize_msa.c", - "libvpx/vp8/encoder/modecosts.c", - "libvpx/vp8/encoder/onyx_if.c", - "libvpx/vp8/encoder/pickinter.c", - "libvpx/vp8/encoder/picklpf.c", - "libvpx/vp8/encoder/ratectrl.c", - "libvpx/vp8/encoder/rdopt.c", - "libvpx/vp8/encoder/segmentation.c", - "libvpx/vp8/encoder/tokenize.c", - "libvpx/vp8/encoder/treewriter.c", - "libvpx/vp8/encoder/vp8_quantize.c", - "libvpx/vp8/vp8_cx_iface.c", - "libvpx/vp8/vp8_dx_iface.c", "libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c", "libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c", "libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c", - "libvpx/vp9/common/vp9_alloccommon.c", - "libvpx/vp9/common/vp9_blockd.c", - "libvpx/vp9/common/vp9_common_data.c", - "libvpx/vp9/common/vp9_entropy.c", - "libvpx/vp9/common/vp9_entropymode.c", - "libvpx/vp9/common/vp9_entropymv.c", - "libvpx/vp9/common/vp9_filter.c", - "libvpx/vp9/common/vp9_frame_buffers.c", - "libvpx/vp9/common/vp9_idct.c", - "libvpx/vp9/common/vp9_loopfilter.c", - "libvpx/vp9/common/vp9_mvref_common.c", - "libvpx/vp9/common/vp9_pred_common.c", - "libvpx/vp9/common/vp9_quant_common.c", - "libvpx/vp9/common/vp9_reconinter.c", - "libvpx/vp9/common/vp9_reconintra.c", - "libvpx/vp9/common/vp9_rtcd.c", - "libvpx/vp9/common/vp9_scale.c", - "libvpx/vp9/common/vp9_scan.c", - "libvpx/vp9/common/vp9_seg_common.c", - "libvpx/vp9/common/vp9_thread_common.c", - "libvpx/vp9/common/vp9_tile_common.c", - "libvpx/vp9/decoder/vp9_decodeframe.c", - "libvpx/vp9/decoder/vp9_decodemv.c", - "libvpx/vp9/decoder/vp9_decoder.c", - "libvpx/vp9/decoder/vp9_detokenize.c", - "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/mips/msa/vp9_error_msa.c", "libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c", "libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c", "libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c", - "libvpx/vp9/encoder/vp9_alt_ref_aq.c", - "libvpx/vp9/encoder/vp9_aq_360.c", - "libvpx/vp9/encoder/vp9_aq_complexity.c", - "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c", - "libvpx/vp9/encoder/vp9_aq_variance.c", - "libvpx/vp9/encoder/vp9_bitstream.c", - "libvpx/vp9/encoder/vp9_context_tree.c", - "libvpx/vp9/encoder/vp9_cost.c", - "libvpx/vp9/encoder/vp9_dct.c", - "libvpx/vp9/encoder/vp9_encodeframe.c", - "libvpx/vp9/encoder/vp9_encodemb.c", - "libvpx/vp9/encoder/vp9_encodemv.c", - "libvpx/vp9/encoder/vp9_encoder.c", - "libvpx/vp9/encoder/vp9_ethread.c", - "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", - "libvpx/vp9/encoder/vp9_frame_scale.c", - "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", - "libvpx/vp9/encoder/vp9_mcomp.c", - "libvpx/vp9/encoder/vp9_multi_thread.c", - "libvpx/vp9/encoder/vp9_noise_estimate.c", - "libvpx/vp9/encoder/vp9_picklpf.c", - "libvpx/vp9/encoder/vp9_pickmode.c", - "libvpx/vp9/encoder/vp9_quantize.c", - "libvpx/vp9/encoder/vp9_ratectrl.c", - "libvpx/vp9/encoder/vp9_rd.c", - "libvpx/vp9/encoder/vp9_rdopt.c", - "libvpx/vp9/encoder/vp9_resize.c", - "libvpx/vp9/encoder/vp9_segmentation.c", - "libvpx/vp9/encoder/vp9_skin_detection.c", - "libvpx/vp9/encoder/vp9_speed_features.c", - "libvpx/vp9/encoder/vp9_subexp.c", - "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", - "libvpx/vp9/encoder/vp9_tokenize.c", - "libvpx/vp9/encoder/vp9_treewriter.c", - "libvpx/vp9/vp9_cx_iface.c", - "libvpx/vp9/vp9_dx_iface.c", - "libvpx/vpx/src/vpx_codec.c", - "libvpx/vpx/src/vpx_decoder.c", - "libvpx/vpx/src/vpx_encoder.c", - "libvpx/vpx/src/vpx_image.c", - 
"libvpx/vpx_dsp/avg.c", - "libvpx/vpx_dsp/bitreader.c", - "libvpx/vpx_dsp/bitreader_buffer.c", - "libvpx/vpx_dsp/bitwriter.c", - "libvpx/vpx_dsp/bitwriter_buffer.c", - "libvpx/vpx_dsp/fwd_txfm.c", - "libvpx/vpx_dsp/intrapred.c", - "libvpx/vpx_dsp/inv_txfm.c", - "libvpx/vpx_dsp/loopfilter.c", "libvpx/vpx_dsp/mips/avg_msa.c", "libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c", "libvpx/vpx_dsp/mips/fwd_txfm_msa.c", - "libvpx/vpx_dsp/mips/idct16x16_msa.c", - "libvpx/vpx_dsp/mips/idct32x32_msa.c", - "libvpx/vpx_dsp/mips/idct4x4_msa.c", - "libvpx/vpx_dsp/mips/idct8x8_msa.c", "libvpx/vpx_dsp/mips/intrapred_msa.c", "libvpx/vpx_dsp/mips/loopfilter_16_msa.c", "libvpx/vpx_dsp/mips/loopfilter_4_msa.c", @@ -1085,25 +680,13 @@ libvpx_mips32_msa_c_srcs = [ "libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c", "libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c", "libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c", - "libvpx/vpx_dsp/prob.c", - "libvpx/vpx_dsp/psnr.c", - "libvpx/vpx_dsp/quantize.c", - "libvpx/vpx_dsp/sad.c", - "libvpx/vpx_dsp/subtract.c", - "libvpx/vpx_dsp/sum_squares.c", - "libvpx/vpx_dsp/variance.c", - "libvpx/vpx_dsp/vpx_convolve.c", - "libvpx/vpx_dsp/vpx_dsp_rtcd.c", - "libvpx/vpx_mem/vpx_mem.c", - "libvpx/vpx_scale/generic/gen_scalers.c", - "libvpx/vpx_scale/generic/vpx_scale.c", - "libvpx/vpx_scale/generic/yv12config.c", - "libvpx/vpx_scale/generic/yv12extend.c", - "libvpx/vpx_scale/vpx_scale_rtcd.c", - "libvpx/vpx_util/vpx_thread.c", "config/mips32-msa/vpx_config.c", ] +libvpx_mips32_msa_exclude_c_srcs = [ + "config/mips32/vpx_config.c", +] + libvpx_mips32_c_srcs = [ "libvpx/vp8/common/alloccommon.c", "libvpx/vp8/common/blockd.c", @@ -1130,6 +713,7 @@ libvpx_mips32_c_srcs = [ "libvpx/vp8/common/swapyv12buffer.c", "libvpx/vp8/common/treecoder.c", "libvpx/vp8/common/vp8_loopfilter.c", + "libvpx/vp8/common/vp8_skin_detection.c", "libvpx/vp8/decoder/dboolhuff.c", "libvpx/vp8/decoder/decodeframe.c", "libvpx/vp8/decoder/decodemv.c", @@ -1185,7 +769,6 @@ libvpx_mips32_c_srcs = [ "libvpx/vp9/decoder/vp9_decoder.c", "libvpx/vp9/decoder/vp9_detokenize.c", "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/vp9_alt_ref_aq.c", "libvpx/vp9/encoder/vp9_aq_360.c", "libvpx/vp9/encoder/vp9_aq_complexity.c", @@ -1201,10 +784,8 @@ libvpx_mips32_c_srcs = [ "libvpx/vp9/encoder/vp9_encoder.c", "libvpx/vp9/encoder/vp9_ethread.c", "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", "libvpx/vp9/encoder/vp9_frame_scale.c", "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", "libvpx/vp9/encoder/vp9_mcomp.c", "libvpx/vp9/encoder/vp9_multi_thread.c", "libvpx/vp9/encoder/vp9_noise_estimate.c", @@ -1220,7 +801,6 @@ libvpx_mips32_c_srcs = [ "libvpx/vp9/encoder/vp9_speed_features.c", "libvpx/vp9/encoder/vp9_subexp.c", "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", "libvpx/vp9/encoder/vp9_tokenize.c", "libvpx/vp9/encoder/vp9_treewriter.c", "libvpx/vp9/vp9_cx_iface.c", @@ -1242,6 +822,7 @@ libvpx_mips32_c_srcs = [ "libvpx/vpx_dsp/psnr.c", "libvpx/vpx_dsp/quantize.c", "libvpx/vpx_dsp/sad.c", + "libvpx/vpx_dsp/skin_detection.c", "libvpx/vpx_dsp/subtract.c", "libvpx/vpx_dsp/sum_squares.c", "libvpx/vpx_dsp/variance.c", @@ -1254,166 +835,30 @@ libvpx_mips32_c_srcs = [ "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/vpx_scale_rtcd.c", "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_util/vpx_write_yuv_frame.c", "config/mips32/vpx_config.c", ] libvpx_mips64_msa_c_srcs = [ - "libvpx/vp8/common/alloccommon.c", - 
"libvpx/vp8/common/blockd.c", - "libvpx/vp8/common/copy_c.c", - "libvpx/vp8/common/dequantize.c", - "libvpx/vp8/common/entropy.c", - "libvpx/vp8/common/entropymode.c", - "libvpx/vp8/common/entropymv.c", - "libvpx/vp8/common/extend.c", - "libvpx/vp8/common/filter.c", - "libvpx/vp8/common/findnearmv.c", - "libvpx/vp8/common/generic/systemdependent.c", - "libvpx/vp8/common/idct_blk.c", - "libvpx/vp8/common/idctllm.c", - "libvpx/vp8/common/loopfilter_filters.c", - "libvpx/vp8/common/mbpitch.c", "libvpx/vp8/common/mips/msa/bilinear_filter_msa.c", "libvpx/vp8/common/mips/msa/copymem_msa.c", "libvpx/vp8/common/mips/msa/idct_msa.c", "libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c", "libvpx/vp8/common/mips/msa/sixtap_filter_msa.c", - "libvpx/vp8/common/modecont.c", - "libvpx/vp8/common/quant_common.c", - "libvpx/vp8/common/reconinter.c", - "libvpx/vp8/common/reconintra.c", - "libvpx/vp8/common/reconintra4x4.c", - "libvpx/vp8/common/rtcd.c", - "libvpx/vp8/common/setupintrarecon.c", - "libvpx/vp8/common/swapyv12buffer.c", - "libvpx/vp8/common/treecoder.c", - "libvpx/vp8/common/vp8_loopfilter.c", - "libvpx/vp8/decoder/dboolhuff.c", - "libvpx/vp8/decoder/decodeframe.c", - "libvpx/vp8/decoder/decodemv.c", - "libvpx/vp8/decoder/detokenize.c", - "libvpx/vp8/decoder/onyxd_if.c", - "libvpx/vp8/decoder/threading.c", - "libvpx/vp8/encoder/bitstream.c", - "libvpx/vp8/encoder/boolhuff.c", - "libvpx/vp8/encoder/dct.c", - "libvpx/vp8/encoder/denoising.c", - "libvpx/vp8/encoder/encodeframe.c", - "libvpx/vp8/encoder/encodeintra.c", - "libvpx/vp8/encoder/encodemb.c", - "libvpx/vp8/encoder/encodemv.c", - "libvpx/vp8/encoder/ethreading.c", - "libvpx/vp8/encoder/lookahead.c", - "libvpx/vp8/encoder/mcomp.c", "libvpx/vp8/encoder/mips/msa/dct_msa.c", "libvpx/vp8/encoder/mips/msa/denoising_msa.c", "libvpx/vp8/encoder/mips/msa/encodeopt_msa.c", "libvpx/vp8/encoder/mips/msa/quantize_msa.c", - "libvpx/vp8/encoder/modecosts.c", - "libvpx/vp8/encoder/onyx_if.c", - "libvpx/vp8/encoder/pickinter.c", - "libvpx/vp8/encoder/picklpf.c", - "libvpx/vp8/encoder/ratectrl.c", - "libvpx/vp8/encoder/rdopt.c", - "libvpx/vp8/encoder/segmentation.c", - "libvpx/vp8/encoder/tokenize.c", - "libvpx/vp8/encoder/treewriter.c", - "libvpx/vp8/encoder/vp8_quantize.c", - "libvpx/vp8/vp8_cx_iface.c", - "libvpx/vp8/vp8_dx_iface.c", "libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c", "libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c", "libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c", - "libvpx/vp9/common/vp9_alloccommon.c", - "libvpx/vp9/common/vp9_blockd.c", - "libvpx/vp9/common/vp9_common_data.c", - "libvpx/vp9/common/vp9_entropy.c", - "libvpx/vp9/common/vp9_entropymode.c", - "libvpx/vp9/common/vp9_entropymv.c", - "libvpx/vp9/common/vp9_filter.c", - "libvpx/vp9/common/vp9_frame_buffers.c", - "libvpx/vp9/common/vp9_idct.c", - "libvpx/vp9/common/vp9_loopfilter.c", - "libvpx/vp9/common/vp9_mvref_common.c", - "libvpx/vp9/common/vp9_pred_common.c", - "libvpx/vp9/common/vp9_quant_common.c", - "libvpx/vp9/common/vp9_reconinter.c", - "libvpx/vp9/common/vp9_reconintra.c", - "libvpx/vp9/common/vp9_rtcd.c", - "libvpx/vp9/common/vp9_scale.c", - "libvpx/vp9/common/vp9_scan.c", - "libvpx/vp9/common/vp9_seg_common.c", - "libvpx/vp9/common/vp9_thread_common.c", - "libvpx/vp9/common/vp9_tile_common.c", - "libvpx/vp9/decoder/vp9_decodeframe.c", - "libvpx/vp9/decoder/vp9_decodemv.c", - "libvpx/vp9/decoder/vp9_decoder.c", - "libvpx/vp9/decoder/vp9_detokenize.c", - "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", 
"libvpx/vp9/encoder/mips/msa/vp9_error_msa.c", "libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c", "libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c", "libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c", - "libvpx/vp9/encoder/vp9_alt_ref_aq.c", - "libvpx/vp9/encoder/vp9_aq_360.c", - "libvpx/vp9/encoder/vp9_aq_complexity.c", - "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c", - "libvpx/vp9/encoder/vp9_aq_variance.c", - "libvpx/vp9/encoder/vp9_bitstream.c", - "libvpx/vp9/encoder/vp9_context_tree.c", - "libvpx/vp9/encoder/vp9_cost.c", - "libvpx/vp9/encoder/vp9_dct.c", - "libvpx/vp9/encoder/vp9_encodeframe.c", - "libvpx/vp9/encoder/vp9_encodemb.c", - "libvpx/vp9/encoder/vp9_encodemv.c", - "libvpx/vp9/encoder/vp9_encoder.c", - "libvpx/vp9/encoder/vp9_ethread.c", - "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", - "libvpx/vp9/encoder/vp9_frame_scale.c", - "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", - "libvpx/vp9/encoder/vp9_mcomp.c", - "libvpx/vp9/encoder/vp9_multi_thread.c", - "libvpx/vp9/encoder/vp9_noise_estimate.c", - "libvpx/vp9/encoder/vp9_picklpf.c", - "libvpx/vp9/encoder/vp9_pickmode.c", - "libvpx/vp9/encoder/vp9_quantize.c", - "libvpx/vp9/encoder/vp9_ratectrl.c", - "libvpx/vp9/encoder/vp9_rd.c", - "libvpx/vp9/encoder/vp9_rdopt.c", - "libvpx/vp9/encoder/vp9_resize.c", - "libvpx/vp9/encoder/vp9_segmentation.c", - "libvpx/vp9/encoder/vp9_skin_detection.c", - "libvpx/vp9/encoder/vp9_speed_features.c", - "libvpx/vp9/encoder/vp9_subexp.c", - "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", - "libvpx/vp9/encoder/vp9_tokenize.c", - "libvpx/vp9/encoder/vp9_treewriter.c", - "libvpx/vp9/vp9_cx_iface.c", - "libvpx/vp9/vp9_dx_iface.c", - "libvpx/vpx/src/vpx_codec.c", - "libvpx/vpx/src/vpx_decoder.c", - "libvpx/vpx/src/vpx_encoder.c", - "libvpx/vpx/src/vpx_image.c", - "libvpx/vpx_dsp/avg.c", - "libvpx/vpx_dsp/bitreader.c", - "libvpx/vpx_dsp/bitreader_buffer.c", - "libvpx/vpx_dsp/bitwriter.c", - "libvpx/vpx_dsp/bitwriter_buffer.c", - "libvpx/vpx_dsp/fwd_txfm.c", - "libvpx/vpx_dsp/intrapred.c", - "libvpx/vpx_dsp/inv_txfm.c", - "libvpx/vpx_dsp/loopfilter.c", "libvpx/vpx_dsp/mips/avg_msa.c", "libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c", "libvpx/vpx_dsp/mips/fwd_txfm_msa.c", - "libvpx/vpx_dsp/mips/idct16x16_msa.c", - "libvpx/vpx_dsp/mips/idct32x32_msa.c", - "libvpx/vpx_dsp/mips/idct4x4_msa.c", - "libvpx/vpx_dsp/mips/idct8x8_msa.c", "libvpx/vpx_dsp/mips/intrapred_msa.c", "libvpx/vpx_dsp/mips/loopfilter_16_msa.c", "libvpx/vpx_dsp/mips/loopfilter_4_msa.c", @@ -1431,25 +876,13 @@ libvpx_mips64_msa_c_srcs = [ "libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c", "libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c", "libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c", - "libvpx/vpx_dsp/prob.c", - "libvpx/vpx_dsp/psnr.c", - "libvpx/vpx_dsp/quantize.c", - "libvpx/vpx_dsp/sad.c", - "libvpx/vpx_dsp/subtract.c", - "libvpx/vpx_dsp/sum_squares.c", - "libvpx/vpx_dsp/variance.c", - "libvpx/vpx_dsp/vpx_convolve.c", - "libvpx/vpx_dsp/vpx_dsp_rtcd.c", - "libvpx/vpx_mem/vpx_mem.c", - "libvpx/vpx_scale/generic/gen_scalers.c", - "libvpx/vpx_scale/generic/vpx_scale.c", - "libvpx/vpx_scale/generic/yv12config.c", - "libvpx/vpx_scale/generic/yv12extend.c", - "libvpx/vpx_scale/vpx_scale_rtcd.c", - "libvpx/vpx_util/vpx_thread.c", "config/mips64-msa/vpx_config.c", ] +libvpx_mips64_msa_exclude_c_srcs = [ + "config/mips64/vpx_config.c", +] + libvpx_mips64_c_srcs = [ "libvpx/vp8/common/alloccommon.c", "libvpx/vp8/common/blockd.c", @@ -1476,6 +909,7 @@ libvpx_mips64_c_srcs 
= [ "libvpx/vp8/common/swapyv12buffer.c", "libvpx/vp8/common/treecoder.c", "libvpx/vp8/common/vp8_loopfilter.c", + "libvpx/vp8/common/vp8_skin_detection.c", "libvpx/vp8/decoder/dboolhuff.c", "libvpx/vp8/decoder/decodeframe.c", "libvpx/vp8/decoder/decodemv.c", @@ -1531,7 +965,6 @@ libvpx_mips64_c_srcs = [ "libvpx/vp9/decoder/vp9_decoder.c", "libvpx/vp9/decoder/vp9_detokenize.c", "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/vp9_alt_ref_aq.c", "libvpx/vp9/encoder/vp9_aq_360.c", "libvpx/vp9/encoder/vp9_aq_complexity.c", @@ -1547,10 +980,8 @@ libvpx_mips64_c_srcs = [ "libvpx/vp9/encoder/vp9_encoder.c", "libvpx/vp9/encoder/vp9_ethread.c", "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", "libvpx/vp9/encoder/vp9_frame_scale.c", "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", "libvpx/vp9/encoder/vp9_mcomp.c", "libvpx/vp9/encoder/vp9_multi_thread.c", "libvpx/vp9/encoder/vp9_noise_estimate.c", @@ -1566,7 +997,6 @@ libvpx_mips64_c_srcs = [ "libvpx/vp9/encoder/vp9_speed_features.c", "libvpx/vp9/encoder/vp9_subexp.c", "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", "libvpx/vp9/encoder/vp9_tokenize.c", "libvpx/vp9/encoder/vp9_treewriter.c", "libvpx/vp9/vp9_cx_iface.c", @@ -1588,6 +1018,7 @@ libvpx_mips64_c_srcs = [ "libvpx/vpx_dsp/psnr.c", "libvpx/vpx_dsp/quantize.c", "libvpx/vpx_dsp/sad.c", + "libvpx/vpx_dsp/skin_detection.c", "libvpx/vpx_dsp/subtract.c", "libvpx/vpx_dsp/sum_squares.c", "libvpx/vpx_dsp/variance.c", @@ -1600,6 +1031,7 @@ libvpx_mips64_c_srcs = [ "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/vpx_scale_rtcd.c", "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_util/vpx_write_yuv_frame.c", "config/mips64/vpx_config.c", ] @@ -1631,6 +1063,7 @@ libvpx_x86_c_srcs = [ "libvpx/vp8/common/swapyv12buffer.c", "libvpx/vp8/common/treecoder.c", "libvpx/vp8/common/vp8_loopfilter.c", + "libvpx/vp8/common/vp8_skin_detection.c", "libvpx/vp8/common/x86/filter_x86.c", "libvpx/vp8/common/x86/idct_blk_mmx.c", "libvpx/vp8/common/x86/idct_blk_sse2.c", @@ -1664,10 +1097,9 @@ libvpx_x86_c_srcs = [ "libvpx/vp8/encoder/treewriter.c", "libvpx/vp8/encoder/vp8_quantize.c", "libvpx/vp8/encoder/x86/denoising_sse2.c", - "libvpx/vp8/encoder/x86/quantize_ssse3.c", - "libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c", "libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c", "libvpx/vp8/encoder/x86/vp8_quantize_sse2.c", + "libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c", "libvpx/vp8/vp8_cx_iface.c", "libvpx/vp8/vp8_dx_iface.c", "libvpx/vp9/common/vp9_alloccommon.c", @@ -1697,7 +1129,6 @@ libvpx_x86_c_srcs = [ "libvpx/vp9/decoder/vp9_decoder.c", "libvpx/vp9/decoder/vp9_detokenize.c", "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/vp9_alt_ref_aq.c", "libvpx/vp9/encoder/vp9_aq_360.c", "libvpx/vp9/encoder/vp9_aq_complexity.c", @@ -1713,10 +1144,8 @@ libvpx_x86_c_srcs = [ "libvpx/vp9/encoder/vp9_encoder.c", "libvpx/vp9/encoder/vp9_ethread.c", "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", "libvpx/vp9/encoder/vp9_frame_scale.c", "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", "libvpx/vp9/encoder/vp9_mcomp.c", "libvpx/vp9/encoder/vp9_multi_thread.c", "libvpx/vp9/encoder/vp9_noise_estimate.c", @@ -1732,12 +1161,12 @@ libvpx_x86_c_srcs = [ "libvpx/vp9/encoder/vp9_speed_features.c", "libvpx/vp9/encoder/vp9_subexp.c", "libvpx/vp9/encoder/vp9_svc_layercontext.c", - 
"libvpx/vp9/encoder/vp9_temporal_filter.c", "libvpx/vp9/encoder/vp9_tokenize.c", "libvpx/vp9/encoder/vp9_treewriter.c", "libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c", "libvpx/vp9/encoder/x86/vp9_dct_ssse3.c", "libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c", + "libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c", "libvpx/vp9/encoder/x86/vp9_quantize_sse2.c", "libvpx/vp9/vp9_cx_iface.c", "libvpx/vp9/vp9_dx_iface.c", @@ -1760,6 +1189,7 @@ libvpx_x86_c_srcs = [ "libvpx/vpx_dsp/psnr.c", "libvpx/vpx_dsp/quantize.c", "libvpx/vpx_dsp/sad.c", + "libvpx/vpx_dsp/skin_detection.c", "libvpx/vpx_dsp/subtract.c", "libvpx/vpx_dsp/sum_squares.c", "libvpx/vpx_dsp/variance.c", @@ -1768,10 +1198,20 @@ libvpx_x86_c_srcs = [ "libvpx/vpx_dsp/x86/avg_intrin_sse2.c", "libvpx/vpx_dsp/x86/avg_pred_sse2.c", "libvpx/vpx_dsp/x86/fwd_txfm_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c", + "libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c", + "libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c", + "libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c", + "libvpx/vpx_dsp/x86/highbd_variance_sse2.c", "libvpx/vpx_dsp/x86/inv_txfm_sse2.c", "libvpx/vpx_dsp/x86/inv_txfm_ssse3.c", "libvpx/vpx_dsp/x86/loopfilter_sse2.c", "libvpx/vpx_dsp/x86/quantize_sse2.c", + "libvpx/vpx_dsp/x86/quantize_ssse3.c", "libvpx/vpx_dsp/x86/sum_squares_sse2.c", "libvpx/vpx_dsp/x86/variance_sse2.c", "libvpx/vpx_dsp/x86/vpx_asm_stubs.c", @@ -1783,6 +1223,7 @@ libvpx_x86_c_srcs = [ "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/vpx_scale_rtcd.c", "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_util/vpx_write_yuv_frame.c", "config/x86/vpx_config.c", ] @@ -1803,11 +1244,15 @@ libvpx_x86_asm_srcs = [ "libvpx/vp8/encoder/x86/dct_sse2.asm", "libvpx/vp8/encoder/x86/encodeopt.asm", "libvpx/vp8/encoder/x86/fwalsh_sse2.asm", - "libvpx/vp8/encoder/x86/quantize_mmx.asm", "libvpx/vp9/encoder/x86/vp9_dct_sse2.asm", "libvpx/vp9/encoder/x86/vp9_error_sse2.asm", "libvpx/vpx_dsp/x86/add_noise_sse2.asm", "libvpx/vpx_dsp/x86/deblock_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_sad_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm", "libvpx/vpx_dsp/x86/intrapred_sse2.asm", "libvpx/vpx_dsp/x86/intrapred_ssse3.asm", "libvpx/vpx_dsp/x86/inv_wht_sse2.asm", @@ -1818,6 +1263,8 @@ libvpx_x86_asm_srcs = [ "libvpx/vpx_dsp/x86/subpel_variance_sse2.asm", "libvpx/vpx_dsp/x86/subtract_sse2.asm", "libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm", + "libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm", + "libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm", "libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm", "libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm", "libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm", @@ -1854,6 +1301,7 @@ libvpx_x86_64_c_srcs = [ "libvpx/vp8/common/swapyv12buffer.c", "libvpx/vp8/common/treecoder.c", "libvpx/vp8/common/vp8_loopfilter.c", + "libvpx/vp8/common/vp8_skin_detection.c", "libvpx/vp8/common/x86/filter_x86.c", "libvpx/vp8/common/x86/idct_blk_mmx.c", "libvpx/vp8/common/x86/idct_blk_sse2.c", @@ -1887,10 +1335,9 @@ libvpx_x86_64_c_srcs = [ "libvpx/vp8/encoder/treewriter.c", "libvpx/vp8/encoder/vp8_quantize.c", "libvpx/vp8/encoder/x86/denoising_sse2.c", - 
"libvpx/vp8/encoder/x86/quantize_ssse3.c", - "libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c", "libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c", "libvpx/vp8/encoder/x86/vp8_quantize_sse2.c", + "libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c", "libvpx/vp8/vp8_cx_iface.c", "libvpx/vp8/vp8_dx_iface.c", "libvpx/vp9/common/vp9_alloccommon.c", @@ -1920,7 +1367,6 @@ libvpx_x86_64_c_srcs = [ "libvpx/vp9/decoder/vp9_decoder.c", "libvpx/vp9/decoder/vp9_detokenize.c", "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/vp9_alt_ref_aq.c", "libvpx/vp9/encoder/vp9_aq_360.c", "libvpx/vp9/encoder/vp9_aq_complexity.c", @@ -1936,10 +1382,8 @@ libvpx_x86_64_c_srcs = [ "libvpx/vp9/encoder/vp9_encoder.c", "libvpx/vp9/encoder/vp9_ethread.c", "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", "libvpx/vp9/encoder/vp9_frame_scale.c", "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", "libvpx/vp9/encoder/vp9_mcomp.c", "libvpx/vp9/encoder/vp9_multi_thread.c", "libvpx/vp9/encoder/vp9_noise_estimate.c", @@ -1955,12 +1399,12 @@ libvpx_x86_64_c_srcs = [ "libvpx/vp9/encoder/vp9_speed_features.c", "libvpx/vp9/encoder/vp9_subexp.c", "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", "libvpx/vp9/encoder/vp9_tokenize.c", "libvpx/vp9/encoder/vp9_treewriter.c", "libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c", "libvpx/vp9/encoder/x86/vp9_dct_ssse3.c", "libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c", + "libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c", "libvpx/vp9/encoder/x86/vp9_quantize_sse2.c", "libvpx/vp9/vp9_cx_iface.c", "libvpx/vp9/vp9_dx_iface.c", @@ -1983,6 +1427,7 @@ libvpx_x86_64_c_srcs = [ "libvpx/vpx_dsp/psnr.c", "libvpx/vpx_dsp/quantize.c", "libvpx/vpx_dsp/sad.c", + "libvpx/vpx_dsp/skin_detection.c", "libvpx/vpx_dsp/subtract.c", "libvpx/vpx_dsp/sum_squares.c", "libvpx/vpx_dsp/variance.c", @@ -1991,10 +1436,20 @@ libvpx_x86_64_c_srcs = [ "libvpx/vpx_dsp/x86/avg_intrin_sse2.c", "libvpx/vpx_dsp/x86/avg_pred_sse2.c", "libvpx/vpx_dsp/x86/fwd_txfm_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c", + "libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c", + "libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c", + "libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c", + "libvpx/vpx_dsp/x86/highbd_variance_sse2.c", "libvpx/vpx_dsp/x86/inv_txfm_sse2.c", "libvpx/vpx_dsp/x86/inv_txfm_ssse3.c", "libvpx/vpx_dsp/x86/loopfilter_sse2.c", "libvpx/vpx_dsp/x86/quantize_sse2.c", + "libvpx/vpx_dsp/x86/quantize_ssse3.c", "libvpx/vpx_dsp/x86/sum_squares_sse2.c", "libvpx/vpx_dsp/x86/variance_sse2.c", "libvpx/vpx_dsp/x86/vpx_asm_stubs.c", @@ -2006,6 +1461,7 @@ libvpx_x86_64_c_srcs = [ "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/vpx_scale_rtcd.c", "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_util/vpx_write_yuv_frame.c", "config/x86_64/vpx_config.c", ] @@ -2027,7 +1483,6 @@ libvpx_x86_64_asm_srcs = [ "libvpx/vp8/encoder/x86/dct_sse2.asm", "libvpx/vp8/encoder/x86/encodeopt.asm", "libvpx/vp8/encoder/x86/fwalsh_sse2.asm", - "libvpx/vp8/encoder/x86/quantize_mmx.asm", "libvpx/vp9/encoder/x86/vp9_dct_sse2.asm", "libvpx/vp9/encoder/x86/vp9_error_sse2.asm", "libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm", @@ -2035,10 +1490,14 @@ libvpx_x86_64_asm_srcs = [ "libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm", 
"libvpx/vpx_dsp/x86/deblock_sse2.asm", "libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm", + "libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_sad_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm", "libvpx/vpx_dsp/x86/intrapred_sse2.asm", "libvpx/vpx_dsp/x86/intrapred_ssse3.asm", "libvpx/vpx_dsp/x86/inv_wht_sse2.asm", - "libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm", "libvpx/vpx_dsp/x86/sad4d_sse2.asm", "libvpx/vpx_dsp/x86/sad_sse2.asm", "libvpx/vpx_dsp/x86/sad_sse3.asm", @@ -2047,6 +1506,8 @@ libvpx_x86_64_asm_srcs = [ "libvpx/vpx_dsp/x86/subpel_variance_sse2.asm", "libvpx/vpx_dsp/x86/subtract_sse2.asm", "libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm", + "libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm", + "libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm", "libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm", "libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm", "libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm", @@ -2086,7 +1547,7 @@ cc_library_static { local_include_dirs: ["config/arm"], neon: { - exclude_srcs: libvpx_arm_c_srcs, + exclude_srcs: libvpx_arm_neon_exclude_c_srcs, srcs: libvpx_arm_neon_c_srcs, generated_sources: ["libvpx_arm_neon_asm_srcs_converted"], local_include_dirs: ["config/arm-neon"], @@ -2103,13 +1564,13 @@ cc_library_static { local_include_dirs: ["config/mips32"], dspr2: { - exclude_srcs: libvpx_mips32_c_srcs, + exclude_srcs: libvpx_mips32_dspr2_exclude_c_srcs, srcs: libvpx_mips32_dspr2_c_srcs, local_include_dirs: ["config/mips32-dspr2"], }, msa: { - exclude_srcs: libvpx_mips32_c_srcs, + exclude_srcs: libvpx_mips32_msa_exclude_c_srcs, srcs: libvpx_mips32_msa_c_srcs, local_include_dirs: ["config/mips32-msa"], }, @@ -2120,7 +1581,7 @@ cc_library_static { local_include_dirs: ["config/mips64"], msa: { - exclude_srcs: libvpx_mips64_c_srcs, + exclude_srcs: libvpx_mips64_msa_exclude_c_srcs, srcs: libvpx_mips64_msa_c_srcs, local_include_dirs: ["config/mips64-msa"], }, diff --git a/Android.bp.in b/Android.bp.in index ac6a46435..0fb7c9581 100644 --- a/Android.bp.in +++ b/Android.bp.in @@ -15,6 +15,7 @@ gensrcs { cc_library_static { name: "libvpx", + vendor_available: true, arch: { arm: { @@ -28,7 +29,7 @@ cc_library_static { local_include_dirs: ["config/arm"], neon: { - exclude_srcs: libvpx_arm_c_srcs, + exclude_srcs: libvpx_arm_neon_exclude_c_srcs, srcs: libvpx_arm_neon_c_srcs, generated_sources: ["libvpx_arm_neon_asm_srcs_converted"], local_include_dirs: ["config/arm-neon"], @@ -45,13 +46,13 @@ cc_library_static { local_include_dirs: ["config/mips32"], dspr2: { - exclude_srcs: libvpx_mips32_c_srcs, + exclude_srcs: libvpx_mips32_dspr2_exclude_c_srcs, srcs: libvpx_mips32_dspr2_c_srcs, local_include_dirs: ["config/mips32-dspr2"], }, msa: { - exclude_srcs: libvpx_mips32_c_srcs, + exclude_srcs: libvpx_mips32_msa_exclude_c_srcs, srcs: libvpx_mips32_msa_c_srcs, local_include_dirs: ["config/mips32-msa"], }, @@ -62,7 +63,7 @@ cc_library_static { local_include_dirs: ["config/mips64"], msa: { - exclude_srcs: libvpx_mips64_c_srcs, + exclude_srcs: libvpx_mips64_msa_exclude_c_srcs, srcs: libvpx_mips64_msa_c_srcs, local_include_dirs: ["config/mips64-msa"], }, diff --git a/CleanSpec.mk b/CleanSpec.mk new file mode 100644 index 000000000..cac3d3bc5 --- /dev/null +++ b/CleanSpec.mk @@ -0,0 +1,53 @@ +# Copyright (C) 2017 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# If you don't need to do a full clean build but would like to touch
+# a file or delete some intermediate files, add a clean step to the end
+# of the list. These steps will only be run once, if they haven't been
+# run before.
+#
+# E.g.:
+#     $(call add-clean-step, touch -c external/sqlite/sqlite3.h)
+#     $(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/STATIC_LIBRARIES/libz_intermediates)
+#
+# Always use "touch -c" and "rm -f" or "rm -rf" to gracefully deal with
+# files that are missing or have been moved.
+#
+# Use $(PRODUCT_OUT) to get to the "out/target/product/blah/" directory.
+# Use $(OUT_DIR) to refer to the "out" directory.
+#
+# If you need to re-do something that's already mentioned, just copy
+# the command and add it to the bottom of the list. E.g., if a change
+# that you made last week required touching a file and a change you
+# made today requires touching the same file, just copy the old
+# touch step and add it to the end of the list.
+#
+# ************************************************
+# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
+# ************************************************
+
+# For example:
+#$(call add-clean-step, rm -rf $(OUT_DIR)/target/common/obj/APPS/AndroidTests_intermediates)
+#$(call add-clean-step, rm -rf $(OUT_DIR)/target/common/obj/JAVA_LIBRARIES/core_intermediates)
+#$(call add-clean-step, find $(OUT_DIR) -type f -name "IGTalkSession*" -print0 | xargs -0 rm -f)
+#$(call add-clean-step, rm -rf $(PRODUCT_OUT)/data/*)
+
+# ************************************************
+# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
+# ************************************************
+
+# vpx_config.asm change
+$(call add-clean-step, rm -rf $(OUT_DIR)/soong/.intermediates/external/libvpx/libvpx)
+
diff --git a/README.android b/README.android
index 92a84980b..92739cd7b 100644
--- a/README.android
+++ b/README.android
@@ -1,12 +1,12 @@
 Name: libvpx
 URL: http://www.webmproject.org
-Version: v1.6.1-665-gbcfd9c975
+Version: v1.7.0
 License: BSD
 License File: libvpx/LICENSE
-Date: Tuesday May 23 2017
-Branch: origin/master
-Commit: bcfd9c97508531a81cc2f5d393edb9eb1b00ce79
+Date: Wednesday January 24 2018
+Branch: origin/mandarinduck
+Commit: f80be22a1099b2a431c2796f529bb261064ec6b4
 
 Description:
 Contains the sources used to compile libvpx.
diff --git a/README.version b/README.version
index 07913c812..c6c6a3724 100644
--- a/README.version
+++ b/README.version
@@ -1,4 +1,6 @@
-URL: https://chromium.googlesource.com/webm/libvpx.git/+archive/bcfd9c97508531a81cc2f5d393edb9eb1b00ce79.tar.gz
-Version: v1.6.1-665-gbcfd9c975
+URL: https://chromium.googlesource.com/webm/libvpx.git/+archive/v1.7.0.tar.gz
+Version: v1.7.0
 BugComponent: 42195
 Owners: johannkoenig
+Local Modifications:
+  Add visibility="protected" attribute for global variables referenced in asm files.
diff --git a/config/arm-neon/vp8_rtcd.h b/config/arm-neon/vp8_rtcd.h
index 3f112f6f7..4eb59c663 100644
--- a/config/arm-neon/vp8_rtcd.h
+++ b/config/arm-neon/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
diff --git a/config/arm-neon/vp9_rtcd.h b/config/arm-neon/vp9_rtcd.h
index 1df16205a..0f4f04d1f 100644
--- a/config/arm-neon/vp9_rtcd.h
+++ b/config/arm-neon/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -33,9 +34,8 @@ extern "C" {
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-#define vp9_block_error_fp vp9_block_error_fp_neon
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
#define vp9_diamond_search_sad vp9_diamond_search_sad_c
@@ -53,35 +53,62 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_fht8x8 vp9_fht8x8_c
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_c
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp vp9_quantize_fp_neon
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
-#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_neon
void vp9_rtcd(void);
diff --git a/config/arm-neon/vpx_config.asm b/config/arm-neon/vpx_config.asm
index fdeb46a67..0a0b1d240 100644
--- a/config/arm-neon/vpx_config.asm
+++ b/config/arm-neon/vpx_config.asm
@@ -20,7 +20,9 @@
.equ HAVE_SSE4_1 , 0
.equ HAVE_AVX , 0
.equ HAVE_AVX2 , 0
+.equ HAVE_AVX512 , 0
.equ HAVE_VSX , 0
+.equ HAVE_MMI , 0
.equ HAVE_VPX_PORTS , 1
.equ HAVE_PTHREAD_H , 1
.equ HAVE_UNISTD_H , 1
@@ -74,10 +76,11 @@
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
-.equ CONFIG_VP9_HIGHBITDEPTH , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 1
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_SIZE_LIMIT , 1
+.equ CONFIG_ALWAYS_ADJUST_BPM , 0
.equ CONFIG_SPATIAL_SVC , 0
.equ CONFIG_FP_MB_STATS , 0
.equ CONFIG_EMULATE_HARDWARE , 0
diff --git a/config/arm-neon/vpx_config.c b/config/arm-neon/vpx_config.c
index 0eb0a305c..95e12998c 100644
--- a/config/arm-neon/vpx_config.c
+++ b/config/arm-neon/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS.  All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/arm-neon/vpx_config.h b/config/arm-neon/vpx_config.h
index d632a2191..e9d645653 100644
--- a/config/arm-neon/vpx_config.h
+++ b/config/arm-neon/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/arm-neon/vpx_dsp_rtcd.h b/config/arm-neon/vpx_dsp_rtcd.h
index a915afabf..d911fd37f 100644
--- a/config/arm-neon/vpx_dsp_rtcd.h
+++ b/config/arm-neon/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #ifdef __cplusplus @@ -28,38 +30,39 @@ unsigned int vpx_avg_8x8_neon(const uint8_t *, int p); #define vpx_avg_8x8 vpx_avg_8x8_neon void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -#define vpx_comp_avg_pred vpx_comp_avg_pred_c +void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_neon -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8 vpx_convolve8_neon -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg vpx_convolve8_avg_neon -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_neon -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, 
int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_neon -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_horiz vpx_convolve8_horiz_neon -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_vert vpx_convolve8_vert_neon -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_avg vpx_convolve_avg_neon -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, 
ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_neon void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -213,26 +216,32 @@ void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8 #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16 vpx_fdct16x16_c +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16 vpx_fdct16x16_neon void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16_1 vpx_fdct16x16_1_c +void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_neon void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32 vpx_fdct32x32_c +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32 vpx_fdct32x32_neon void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32_1 vpx_fdct32x32_1_c +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_neon void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_neon void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride); #define vpx_fdct4x4 vpx_fdct4x4_neon void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct4x4_1 vpx_fdct4x4_1_c +void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_neon void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride); @@ -273,17 +282,915 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_16x16 vpx_hadamard_16x16_neon -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_8x8 
vpx_hadamard_8x8_neon void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c +void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c + +void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c + +unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c + +unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c + +unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c + +unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t 
vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); 
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c + +unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c + +unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c + +unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c + +unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c + +unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c + +unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c + +unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c + +unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 
vpx_highbd_10_variance64x32_c + +unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c + +unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c + +unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c + +unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c + +void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c + +void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c + +unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c + +unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c + +unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c + +unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define 
vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c + 
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, 
const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define 
vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + 
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_neon + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_neon + +void 
vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_neon
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_neon
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_neon
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_neon
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_neon
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_neon
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_neon
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_neon
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_neon
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_neon
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_neon
+
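Each entry in these generated RTCD headers follows the same pattern: a portable _c reference implementation is declared, a NEON kernel is declared when this configuration has one, and a #define binds the generic name to whichever implementation was selected at configure time. A minimal sketch of what a call site sees (the wrapper function below is hypothetical, not part of the library, and assumes the generated vpx_dsp_rtcd.h is on the include path):

#include <stddef.h>
#include <stdint.h>
#include "./vpx_dsp_rtcd.h"

/* On this arm-neon config the generic name expands to
   vpx_highbd_d135_predictor_8x8_neon via the #define above; on a
   config without NEON it would expand to the _c fallback instead. */
static void predict_d135_8x8(uint16_t *dst, ptrdiff_t stride,
                             const uint16_t *above, const uint16_t *left,
                             int bd) {
  vpx_highbd_d135_predictor_8x8(dst, stride, above, left, bd);
}
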
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_neon
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_neon
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_neon
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_neon
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_neon
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_neon
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_neon
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_neon
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_neon
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_neon
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_neon
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_neon
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_neon
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_neon
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_neon
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_neon
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_neon
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_neon
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_neon
+
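The dc/dc_left/dc_top/dc_128 family above differs only in which borders feed the average. A rough sketch of the plain DC case (my own illustration, not the library source): dc_left and dc_top average a single border, and dc_128 skips the borders entirely, writing the bit-depth midpoint 1 << (bd - 1).

#include <stddef.h>
#include <stdint.h>

/* Illustrative only: average the `above` and `left` borders and fill
   the size x size block with the rounded mean. */
static void highbd_dc_sketch(uint16_t *dst, ptrdiff_t stride,
                             const uint16_t *above, const uint16_t *left,
                             int size) {
  int i, r, c, sum = 0;
  for (i = 0; i < size; ++i) sum += above[i] + left[i];
  {
    const uint16_t dc = (uint16_t)((sum + size) / (2 * size));
    for (r = 0; r < size; ++r, dst += stride)
      for (c = 0; c < size; ++c) dst[c] = dc;
  }
}
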
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_fdct8x8_1_neon
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_neon
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_neon
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_neon
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_neon
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_neon
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_neon
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_neon
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_neon
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_neon
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_neon
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_neon
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_neon
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_neon
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_neon
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_neon
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_neon
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_neon
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_neon
+
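The numeric suffixes on the idct entries above (_1_add, _34_add, _38_add, _135_add, ...) encode how many nonzero coefficients the kernel assumes, so the decoder can pick a cheaper path when the end-of-block position is small; every highbd variant also takes bd so reconstruction clamps to the right pixel range. A sketch of the clamp-and-add step these kernels share (illustrative helpers of my own, with the transform and rounding omitted):

#include <stdint.h>

/* Clip a reconstructed value to [0, (1 << bd) - 1]. */
static uint16_t clip_pixel_highbd_sketch(int val, int bd) {
  const int max = (1 << bd) - 1;
  return (uint16_t)(val < 0 ? 0 : (val > max ? max : val));
}

/* Add an n x n residual block to the prediction in `dest`. */
static void add_residual_sketch(const int32_t *residual, uint16_t *dest,
                                int stride, int bd, int n) {
  int r, c;
  for (r = 0; r < n; ++r, dest += stride, residual += n)
    for (c = 0; c < n; ++c)
      dest[c] = clip_pixel_highbd_sketch(dest[c] + residual[c], bd);
}
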
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_neon
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_neon
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_horizontal_4_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_neon
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_neon
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_horizontal_8_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_neon
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_neon
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_neon
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_neon
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_vertical_4_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_neon
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_neon
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_vertical_8_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_neon
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
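The SAD entries describe sum-of-absolute-differences kernels. A sketch of what any of them computes (hypothetical helper, not library code); the _avg variants compare against the rounded average of ref and second_pred instead of ref alone. Note that the highbd prototypes still take uint8_t pointers: as far as I can tell from the library's convention, 16-bit buffers are passed through them and converted internally, which is why the signatures match the 8-bit ones.

#include <stdint.h>
#include <stdlib.h>

/* Sum of absolute differences over a w x h block. */
static unsigned int sad_sketch(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int w, int h) {
  int r, c;
  unsigned int sad = 0;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) sad += abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}
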
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_neon
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_neon
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_neon
+
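The x4d form above is worth a usage note: one call produces SADs against four candidate references, which lets motion search amortize per-call overhead. A hypothetical usage sketch (not library code), assuming the generated vpx_dsp_rtcd.h is on the include path:

#include <stdint.h>
#include "./vpx_dsp_rtcd.h"

static void sad_4d_example(const uint8_t *src, int src_stride,
                           const uint8_t *const refs[4], int ref_stride,
                           uint32_t sad[4]) {
  /* Expands to vpx_highbd_sad16x16x4d_c on this configuration;
     sad[i] receives the SAD of src against refs[i]. */
  vpx_highbd_sad16x16x4d(src, src_stride, refs, ref_stride, sad);
}
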
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_neon
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_neon
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_neon
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_neon
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_neon
+
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
@@ -416,17 +1323,20 @@ unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint
#define vpx_mse8x8 vpx_mse8x8_c

void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b vpx_quantize_b_c
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_neon

void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon

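This hunk switches vpx_quantize_b and vpx_quantize_b_32x32 from the C fallback to new NEON kernels. For orientation, a heavily simplified per-coefficient sketch of what a quantize_b implementation computes; the real code also applies the zbin dead-zone and quant_shift refinement, walks coefficients in scan order, and tracks eob. The tran_low_t typedef here is an assumption standing in for the library's own.

#include <stdint.h>
#include <stdlib.h>

typedef int32_t tran_low_t; /* assumption for this sketch */

static void quantize_sketch(const tran_low_t *coeff, intptr_t n,
                            const int16_t *round, const int16_t *quant,
                            const int16_t *dequant, tran_low_t *qcoeff,
                            tran_low_t *dqcoeff) {
  intptr_t i;
  for (i = 0; i < n; ++i) {
    const int sign = coeff[i] < 0 ? -1 : 1;
    const int a = abs((int)coeff[i]);
    const int rc = i == 0 ? 0 : 1; /* index 0 is DC, 1 covers all AC */
    const int q = (int)(((int64_t)(a + round[rc]) * quant[rc]) >> 16);
    qcoeff[i] = sign * q;                 /* quantized level */
    dqcoeff[i] = qcoeff[i] * dequant[rc]; /* reconstructed value */
  }
}
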
unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad16x16 vpx_sad16x16_neon

unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_neon

void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad16x16x3 vpx_sad16x16x3_c

@@ -439,223 +1349,247 @@ void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref
#define vpx_sad16x16x8 vpx_sad16x16x8_c

unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad16x32 vpx_sad16x32_c
+unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_neon

unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_neon

void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_neon

unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad16x8 vpx_sad16x8_neon

unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_neon

void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad16x8x3 vpx_sad16x8x3_c

void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_neon

void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad16x8x8 vpx_sad16x8x8_c

unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad32x16 vpx_sad32x16_c
+unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_neon

unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_neon

void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_neon

unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x32 vpx_sad32x32_neon

unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x32_avg vpx_sad32x32_avg_c
-
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
+unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_neon

void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_neon

-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad32x64 vpx_sad32x64_c
+unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_neon

unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_neon

void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_neon

unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad4x4 vpx_sad4x4_neon

unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_neon

void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad4x4x3 vpx_sad4x4x3_c

void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_neon

void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad4x4x8 vpx_sad4x4x8_c

unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad4x8 vpx_sad4x8_c
+unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_neon

unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_neon

void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x4d vpx_sad4x8x4d_c
-
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_neon

unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad64x32 vpx_sad64x32_c
+unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_neon

unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_neon

void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_neon

unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x64 vpx_sad64x64_neon

unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad64x64_avg vpx_sad64x64_avg_c
-
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
+unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_neon

void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_neon

-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_neon

unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_neon

void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x16x3 vpx_sad8x16x3_c

void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_neon

void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x16x8 vpx_sad8x16x8_c

unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad8x4 vpx_sad8x4_c
+unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_neon

unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_neon

void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x4d vpx_sad8x4x4d_c
-
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_neon

unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_neon

unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_neon

void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x3 vpx_sad8x8x3_c

void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_neon

void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_c

-int vpx_satd_c(const int16_t *coeff, int length);
-int vpx_satd_neon(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
+int vpx_satd_neon(const tran_low_t *coeff, int length);
#define vpx_satd vpx_satd_neon

-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vpx_scaled_2d vpx_scaled_2d_c
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_neon

-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c

-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c

-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c

-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_horiz vpx_scaled_horiz_c

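Two API details change in this hunk besides the NEON hookups: vpx_satd now takes tran_low_t coefficients instead of int16_t (so one prototype serves high-bitdepth builds, where coefficients need more than 16 bits), and the vpx_scaled_* family replaces the filter_x/filter_y pointer pair with a single InterpKernel table plus explicit x0_q4/y0_q4 starting phases. What satd computes is simple enough to sketch (illustrative only; the tran_low_t typedef here is an assumption standing in for the library's own):

#include <stdint.h>
#include <stdlib.h>

typedef int32_t tran_low_t; /* assumption for this sketch */

/* Sum of absolute values of a (typically Hadamard-transformed)
   coefficient block. */
static int satd_sketch(const tran_low_t *coeff, int length) {
  int i, satd = 0;
  for (i = 0; i < length; ++i) satd += abs((int)coeff[i]);
  return satd;
}
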
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_vert vpx_scaled_vert_c

uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon

uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_neon

uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_neon

uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_neon

uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_neon

uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_neon

uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_neon

uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_neon

uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_neon

uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_neon

uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_neon

uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_neon

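These sub_pixel_avg_variance kernels build a bilinearly interpolated prediction at the given (xoffset, yoffset) sub-pel position, blend it with second_pred, then measure variance against the reference via the identity variance = SSE - sum^2 / N. The variance half, as an illustrative helper of my own (not library code):

#include <stdint.h>

static uint32_t variance_sketch(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                int w, int h, uint32_t *sse) {
  int r, c;
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int d = a[c] - b[c];
      sum += d;
      sse64 += (uint32_t)(d * d);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = (uint32_t)sse64;
  /* variance = SSE minus the squared mean contribution. */
  return (uint32_t)(sse64 - (uint64_t)(((int64_t)sum * sum) / (w * h)));
}
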
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c +uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_neon uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); @@ -682,10 +1616,12 @@ uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_str #define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c +uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_neon uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c +uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_neon uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); diff --git a/config/arm-neon/vpx_scale_rtcd.h b/config/arm-neon/vpx_scale_rtcd.h index a1564b7ad..b37136827 100644 --- a/config/arm-neon/vpx_scale_rtcd.h +++ b/config/arm-neon/vpx_scale_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c diff --git a/config/arm-neon/vpx_version.h b/config/arm-neon/vpx_version.h index 24da169b4..6078bae90 100644 --- a/config/arm-neon/vpx_version.h +++ b/config/arm-neon/vpx_version.h @@ -1,7 +1,8 @@ +// This file is generated. Do not edit. 
 #define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
 #define VERSION_EXTRA ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
diff --git a/config/arm/vp8_rtcd.h b/config/arm/vp8_rtcd.h
index e089d058d..188b1d7a2 100644
--- a/config/arm/vp8_rtcd.h
+++ b/config/arm/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
diff --git a/config/arm/vp9_rtcd.h b/config/arm/vp9_rtcd.h
index 6d67ad8bc..8cb5870c0 100644
--- a/config/arm/vp9_rtcd.h
+++ b/config/arm/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
@@ -33,7 +34,7 @@ extern "C" {
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 #define vp9_block_error_fp vp9_block_error_fp_c
 
 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -51,12 +52,42 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 #define vp9_fht8x8 vp9_fht8x8_c
 
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_c
 
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
 #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
@@ -75,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
 
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
 void vp9_rtcd(void);
 
 #include "vpx_config.h"
diff --git a/config/arm/vpx_config.asm b/config/arm/vpx_config.asm
index 022dfa9b8..53f2e8535 100644
--- a/config/arm/vpx_config.asm
+++ b/config/arm/vpx_config.asm
@@ -20,7 +20,9 @@
 .equ HAVE_SSE4_1 , 0
 .equ HAVE_AVX , 0
 .equ HAVE_AVX2 , 0
+.equ HAVE_AVX512 , 0
 .equ HAVE_VSX , 0
+.equ HAVE_MMI , 0
 .equ HAVE_VPX_PORTS , 1
 .equ HAVE_PTHREAD_H , 1
 .equ HAVE_UNISTD_H , 1
@@ -74,10 +76,11 @@
 .equ CONFIG_TEMPORAL_DENOISING , 1
 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0
 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
-.equ CONFIG_VP9_HIGHBITDEPTH , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 1
 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0
 .equ CONFIG_EXPERIMENTAL , 0
 .equ CONFIG_SIZE_LIMIT , 1
+.equ CONFIG_ALWAYS_ADJUST_BPM , 0
 .equ CONFIG_SPATIAL_SVC , 0
 .equ CONFIG_FP_MB_STATS , 0
 .equ CONFIG_EMULATE_HARDWARE , 0
diff --git a/config/arm/vpx_config.c b/config/arm/vpx_config.c
index 7bc1805f6..1bc63e4f0 100644
--- a/config/arm/vpx_config.c
+++ b/config/arm/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS. All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=armv7-linux-gcc --disable-neon --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=armv7-linux-gcc --disable-neon --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/arm/vpx_config.h b/config/arm/vpx_config.h
index ddd914e4b..039717798 100644
--- a/config/arm/vpx_config.h
+++ b/config/arm/vpx_config.h
@@ -29,7 +29,9 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
+#define HAVE_AVX512 0
 #define HAVE_VSX 0
+#define HAVE_MMI 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
 #define CONFIG_TEMPORAL_DENOISING 1
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/arm/vpx_dsp_rtcd.h b/config/arm/vpx_dsp_rtcd.h
index 51b423f20..25ee2a9dd 100644
--- a/config/arm/vpx_dsp_rtcd.h
+++ b/config/arm/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 
 #ifdef __cplusplus
@@ -28,28 +30,28 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 #define vpx_comp_avg_pred vpx_comp_avg_pred_c
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8 vpx_convolve8_c
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg vpx_convolve8_avg_c
 
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
 
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
 
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_horiz vpx_convolve8_horiz_c
 
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_vert vpx_convolve8_vert_c
 
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_avg vpx_convolve_avg_c
 
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_copy vpx_convolve_copy_c
 
 void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -229,15 +231,843 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
 
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
 #define vpx_hadamard_16x16 vpx_hadamard_16x16_c
 
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
 #define vpx_hadamard_8x8 vpx_hadamard_8x8_c
 
 void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
 
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+
+unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+
+unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+
+unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+
+unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+
+void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+
+void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+
+unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+
+unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+
+unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+
+unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+
+unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+
+unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+
+unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+
+unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+
+unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+
+unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+
+unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+
+unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+
+unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+
+unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+
+unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p);
+#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
+
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
+
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c
+
+void
vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c + +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t 
*second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c + +unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c + +unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const 
uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c + +unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c + +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, 
const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c + +void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c + +void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c + +void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c + +void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c + +void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c @@ -400,15 +1230,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_c -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad32x32x4d vpx_sad32x32x4d_c -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c - unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad32x64 vpx_sad32x64_c @@ -442,9 +1266,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad4x8x4d vpx_sad4x8x4d_c -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c - unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad64x32 vpx_sad64x32_c @@ -460,15 +1281,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_c -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define 
vpx_sad64x64x4d vpx_sad64x64x4d_c -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c - unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x16 vpx_sad8x16_c @@ -493,9 +1308,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad8x4x4d vpx_sad8x4x4d_c -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c - unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x8 vpx_sad8x8_c @@ -511,25 +1323,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c -int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_c(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_c -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_2d vpx_scaled_2d_c -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void 
vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); diff --git a/config/arm/vpx_scale_rtcd.h b/config/arm/vpx_scale_rtcd.h index a1564b7ad..b37136827 100644 --- a/config/arm/vpx_scale_rtcd.h +++ b/config/arm/vpx_scale_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c diff --git a/config/arm/vpx_version.h b/config/arm/vpx_version.h index 24da169b4..6078bae90 100644 --- a/config/arm/vpx_version.h +++ b/config/arm/vpx_version.h @@ -1,7 +1,8 @@ +// This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 6 -#define VERSION_PATCH 1 +#define VERSION_MINOR 7 +#define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.1" -#define VERSION_STRING " v1.6.1" +#define VERSION_STRING_NOSP "v1.7.0" +#define VERSION_STRING " v1.7.0" diff --git a/config/arm64/vp8_rtcd.h b/config/arm64/vp8_rtcd.h index 3f112f6f7..4eb59c663 100644 --- a/config/arm64/vp8_rtcd.h +++ b/config/arm64/vp8_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ diff --git a/config/arm64/vp9_rtcd.h b/config/arm64/vp9_rtcd.h index 1df16205a..0f4f04d1f 100644 --- a/config/arm64/vp9_rtcd.h +++ b/config/arm64/vp9_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
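The vpx_scaled_* and vpx_convolve* prototypes in this update drop the old (filter_x, x_step_q4, filter_y, y_step_q4) arguments in favor of a single InterpKernel table plus explicit starting phases x0_q4 and y0_q4; the kernel type comes from vpx_dsp/vpx_filter.h, which the dsp header now includes. A sketch of how a caller walks the Q4 phase accumulator under that interface (the pick_taps helper is illustrative, not a libvpx symbol; InterpKernel is assumed to be an 8-tap int16_t row per subpel phase, as in vpx_filter.h):

    #include <stdint.h>

    #define SUBPEL_BITS 4
    #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)  /* 16 subpel phases (Q4) */
    #define SUBPEL_TAPS 8

    typedef int16_t InterpKernel[SUBPEL_TAPS];  /* one 8-tap row per phase */

    /* x0_q4 is the starting phase and x_step_q4 the per-pixel increment;
     * the low 4 bits of the accumulator select the kernel row, while the
     * high bits would select the source sample. */
    static void pick_taps(const InterpKernel *filter, int x0_q4,
                          int x_step_q4, int w, const int16_t **taps_out) {
      int x_q4 = x0_q4;
      for (int x = 0; x < w; ++x) {
        taps_out[x] = filter[x_q4 & SUBPEL_MASK];
        x_q4 += x_step_q4;
      }
    }
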
#ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -33,9 +34,8 @@ extern "C" { int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_c -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -#define vp9_block_error_fp vp9_block_error_fp_neon +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +#define vp9_block_error_fp vp9_block_error_fp_c int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_c @@ -53,35 +53,62 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_fht8x8 vp9_fht8x8_c -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -#define vp9_full_search_sad vp9_full_search_sad_c - void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c +int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); +#define vp9_highbd_block_error vp9_highbd_block_error_c + +void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c + +void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c + +void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c + +void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c + +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd); +#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c + +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c + +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c + +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c + +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c + +void vp9_highbd_temporal_filter_apply_c(const uint8_t 
*frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); +#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c + void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_fp vp9_quantize_fp_neon void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); -#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c - -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c +void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_neon void vp9_rtcd(void); diff --git a/config/arm64/vpx_config.asm b/config/arm64/vpx_config.asm index 7c69642cc..e3e1529bc 100644 --- a/config/arm64/vpx_config.asm +++ b/config/arm64/vpx_config.asm @@ -20,7 +20,9 @@ .equ HAVE_SSE4_1 , 0 .equ HAVE_AVX , 0 .equ HAVE_AVX2 , 0 +.equ HAVE_AVX512 , 0 .equ HAVE_VSX , 0 +.equ HAVE_MMI , 0 .equ HAVE_VPX_PORTS , 1 .equ HAVE_PTHREAD_H , 1 .equ HAVE_UNISTD_H , 1 @@ -74,10 +76,11 @@ .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 -.equ 
CONFIG_VP9_HIGHBITDEPTH , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 1 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0 .equ CONFIG_EXPERIMENTAL , 0 .equ CONFIG_SIZE_LIMIT , 1 +.equ CONFIG_ALWAYS_ADJUST_BPM , 0 .equ CONFIG_SPATIAL_SVC , 0 .equ CONFIG_FP_MB_STATS , 0 .equ CONFIG_EMULATE_HARDWARE , 0 diff --git a/config/arm64/vpx_config.c b/config/arm64/vpx_config.c index ff9121723..13490c81c 100644 --- a/config/arm64/vpx_config.c +++ b/config/arm64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--force-target=armv8-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072"; +static const char* const cfg = "--force-target=armv8-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/config/arm64/vpx_config.h b/config/arm64/vpx_config.h index f1acc55cc..5304f8a57 100644 --- a/config/arm64/vpx_config.h +++ b/config/arm64/vpx_config.h @@ -29,7 +29,9 @@ #define HAVE_SSE4_1 0 #define HAVE_AVX 0 #define HAVE_AVX2 0 +#define HAVE_AVX512 0 #define HAVE_VSX 0 +#define HAVE_MMI 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 1 #define HAVE_UNISTD_H 1 @@ -83,10 +85,11 @@ #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 #define CONFIG_SPATIAL_SVC 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 diff --git a/config/arm64/vpx_dsp_rtcd.h b/config/arm64/vpx_dsp_rtcd.h index a915afabf..d911fd37f 100644 --- a/config/arm64/vpx_dsp_rtcd.h +++ b/config/arm64/vpx_dsp_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
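The vpx_config changes recorded here (CONFIG_VP9_HIGHBITDEPTH flipped from 0 to 1, with --enable-vp9-highbitdepth appended to the stored configure line) are what pull the vpx_highbd_* prototypes into these generated headers: 10- and 12-bit samples no longer fit in a byte, so the high-bit-depth paths traffic in uint16_t. A hedged sketch of the kind of compile-time gating such a switch implies (the pixel_t alias and clamp helper are illustrative, not libvpx code):

    #include <stdint.h>

    #define CONFIG_VP9_HIGHBITDEPTH 1

    #if CONFIG_VP9_HIGHBITDEPTH
    typedef uint16_t pixel_t;  /* room for 10/12-bit samples */
    #else
    typedef uint8_t pixel_t;   /* classic 8-bit path */
    #endif

    /* Clamp a reconstructed value to the legal range for bit depth bd,
     * e.g. [0, 1023] when bd == 10. */
    static pixel_t clamp_pixel(int v, int bd) {
      const int max = (1 << bd) - 1;
      return (pixel_t)(v < 0 ? 0 : (v > max ? max : v));
    }
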
#ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #ifdef __cplusplus @@ -28,38 +30,39 @@ unsigned int vpx_avg_8x8_neon(const uint8_t *, int p); #define vpx_avg_8x8 vpx_avg_8x8_neon void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -#define vpx_comp_avg_pred vpx_comp_avg_pred_c +void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_neon -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8 vpx_convolve8_neon -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg vpx_convolve8_avg_neon -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_neon -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, 
int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_neon -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_horiz vpx_convolve8_horiz_neon -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_vert vpx_convolve8_vert_neon -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_avg vpx_convolve_avg_neon -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, 
ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_neon void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -213,26 +216,32 @@ void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8 #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16 vpx_fdct16x16_c +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16 vpx_fdct16x16_neon void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16_1 vpx_fdct16x16_1_c +void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_neon void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32 vpx_fdct32x32_c +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32 vpx_fdct32x32_neon void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32_1 vpx_fdct32x32_1_c +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_neon void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_neon void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride); #define vpx_fdct4x4 vpx_fdct4x4_neon void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct4x4_1 vpx_fdct4x4_1_c +void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_neon void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride); @@ -273,17 +282,915 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_16x16 vpx_hadamard_16x16_neon -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_8x8 
vpx_hadamard_8x8_neon void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c +void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c + +void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c + +unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c + +unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c + +unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c + +unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t 
vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); 
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c + +unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c + +unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c + +unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c + +unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c + +unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c + +unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c + +unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c + +unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 
vpx_highbd_10_variance64x32_c + +unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c + +unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c + +unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c + +unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c + +void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c + +void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c + +unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c + +unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c + +unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c + +unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define 
vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c + 
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, 
const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define 
vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + 
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_neon + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_neon + +void 
vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_neon + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_neon + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_neon + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_neon + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_neon + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_neon + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const 
uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_neon + +void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_neon + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_neon + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_neon + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c + +void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c + +void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); 
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_neon + +void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_neon + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_neon + +void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_neon + +void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c + +void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c + +void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c + +void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_neon + +void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_neon + +void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_neon + +void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_neon + +void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define 
vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_neon + +void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_neon + +void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_neon + +void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_neon + +void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_neon + +void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_neon + +void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_neon + +void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_neon + +void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_neon + +void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_neon + +void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_neon + +void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, 
const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_neon + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c + +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8_1 vpx_fdct8x8_1_neon + +void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_neon + +void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_neon + +void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_neon + +void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_neon + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_neon + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_neon + +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_256_add 
vpx_highbd_idct16x16_256_add_neon + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_neon + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_neon + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_neon + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_neon + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_neon + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_neon + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_neon + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_neon + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_neon + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_neon + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_neon + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void 
vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_neon + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_neon + +void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_horizontal_4_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_neon + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_neon + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_horizontal_8_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_neon + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_neon + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_neon + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_neon + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_vertical_4_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_neon + +void 
vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_neon + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_vertical_8_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_neon + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c + +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c + 
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c + +unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c + +unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c + +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_neon + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_neon + +void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_neon + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); 
+void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_neon
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_neon
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_neon
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_neon
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_neon
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
@@ -416,17 +1323,20 @@ unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint
 #define vpx_mse8x8 vpx_mse8x8_c

 void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b vpx_quantize_b_c
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_neon

 void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon

 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad16x16 vpx_sad16x16_neon

 unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_neon

 void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad16x16x3 vpx_sad16x16x3_c
@@ -439,223 +1349,247 @@ void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref
 #define vpx_sad16x16x8 vpx_sad16x16x8_c

 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad16x32 vpx_sad16x32_c
+unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_neon

 unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_neon

 void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_neon

 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad16x8 vpx_sad16x8_neon

 unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_neon

 void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad16x8x3 vpx_sad16x8x3_c

 void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_neon

 void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad16x8x8 vpx_sad16x8x8_c

 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad32x16 vpx_sad32x16_c
+unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_neon

 unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_neon

 void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_neon

 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x32 vpx_sad32x32_neon

 unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x32_avg vpx_sad32x32_avg_c
-
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
+unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_neon

 void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad32x32x4d vpx_sad32x32x4d_neon

-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad32x64 vpx_sad32x64_c
+unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_neon

 unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_neon

 void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_neon

 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad4x4 vpx_sad4x4_neon

 unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_neon

 void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad4x4x3 vpx_sad4x4x3_c

 void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_neon

 void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad4x4x8 vpx_sad4x4x8_c

 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad4x8 vpx_sad4x8_c
+unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_neon

 unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_neon

 void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x4d vpx_sad4x8x4d_c
-
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_neon

 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad64x32 vpx_sad64x32_c
+unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_neon

 unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_neon

 void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_neon

 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad64x64 vpx_sad64x64_neon

 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad64x64_avg vpx_sad64x64_avg_c
-
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
+unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_neon

 void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad64x64x4d vpx_sad64x64x4d_neon

-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x16 vpx_sad8x16_neon

 unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_neon

 void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x16x3 vpx_sad8x16x3_c

 void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_neon

 void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x16x8 vpx_sad8x16x8_c

 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad8x4 vpx_sad8x4_c
+unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_neon

 unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_neon

 void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x4d vpx_sad8x4x4d_c
-
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_neon

 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x8 vpx_sad8x8_neon

 unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_neon

 void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x8x3 vpx_sad8x8x3_c

 void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_neon

 void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x8x8 vpx_sad8x8x8_c

-int vpx_satd_c(const int16_t *coeff, int length);
-int vpx_satd_neon(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
+int vpx_satd_neon(const tran_low_t *coeff, int length);
 #define vpx_satd vpx_satd_neon

-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vpx_scaled_2d vpx_scaled_2d_c
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_neon

-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c

-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c

-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c

-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c

-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c

 uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon

 uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_neon

 uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_neon

 uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_neon

 uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_neon

 uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_neon

 uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_neon

 uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_neon

 uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_neon

 uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_neon

 uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_neon

 uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_neon

 uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_neon

 uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -682,10 +1616,12 @@ uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_str
 #define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon

 uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_neon

 uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_neon

 uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
diff --git a/config/arm64/vpx_scale_rtcd.h b/config/arm64/vpx_scale_rtcd.h
index a1564b7ad..b37136827 100644
--- a/config/arm64/vpx_scale_rtcd.h
+++ b/config/arm64/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c

+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/config/arm64/vpx_version.h b/config/arm64/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/arm64/vpx_version.h
+++ b/config/arm64/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
 #define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
 #define VERSION_EXTRA ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
diff --git a/config/generic/vp8_rtcd.h b/config/generic/vp8_rtcd.h
index 1e0ff8a7e..bc3ebe8a1 100644
--- a/config/generic/vp8_rtcd.h
+++ b/config/generic/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
diff --git a/config/generic/vp9_rtcd.h b/config/generic/vp9_rtcd.h
index 7d0a9e2ba..45a371c2f 100644
--- a/config/generic/vp9_rtcd.h
+++ b/config/generic/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
@@ -33,7 +34,7 @@ extern "C" {
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c

-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 #define vp9_block_error_fp vp9_block_error_fp_c

 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -51,12 +52,42 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 #define vp9_fht8x8 vp9_fht8x8_c

-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_c

+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
 #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
@@ -75,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c

-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
 void vp9_rtcd(void);

 #include "vpx_config.h"
diff --git a/config/generic/vpx_config.asm b/config/generic/vpx_config.asm
index 5d3ae3dfc..6b173b661 100644
--- a/config/generic/vpx_config.asm
+++ b/config/generic/vpx_config.asm
@@ -20,7 +20,9 @@
 .equ HAVE_SSE4_1 , 0
 .equ HAVE_AVX , 0
 .equ HAVE_AVX2 , 0
+.equ HAVE_AVX512 , 0
 .equ HAVE_VSX , 0
+.equ HAVE_MMI , 0
 .equ HAVE_VPX_PORTS , 1
 .equ HAVE_PTHREAD_H , 1
 .equ HAVE_UNISTD_H , 1
@@ -74,10 +76,11 @@
 .equ CONFIG_TEMPORAL_DENOISING , 1
 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0
 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
-.equ CONFIG_VP9_HIGHBITDEPTH , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 1
 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0
 .equ CONFIG_EXPERIMENTAL , 0
 .equ CONFIG_SIZE_LIMIT , 1
+.equ CONFIG_ALWAYS_ADJUST_BPM , 0
 .equ CONFIG_SPATIAL_SVC , 0
 .equ CONFIG_FP_MB_STATS , 0
 .equ CONFIG_EMULATE_HARDWARE , 0
diff --git a/config/generic/vpx_config.c b/config/generic/vpx_config.c
index c6d3e14c5..70fcdf7e3 100644
--- a/config/generic/vpx_config.c
+++ b/config/generic/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS. All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=generic-gnu --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=generic-gnu --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/generic/vpx_config.h b/config/generic/vpx_config.h
index 63c75e4f9..dc96f8743 100644
--- a/config/generic/vpx_config.h
+++ b/config/generic/vpx_config.h
@@ -29,7 +29,9 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
+#define HAVE_AVX512 0
 #define HAVE_VSX 0
+#define HAVE_MMI 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
 #define CONFIG_TEMPORAL_DENOISING 1
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/generic/vpx_dsp_rtcd.h b/config/generic/vpx_dsp_rtcd.h
index ae0cea137..be38303bf 100644
--- a/config/generic/vpx_dsp_rtcd.h
+++ b/config/generic/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"

 #ifdef __cplusplus
@@ -28,28 +30,28 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 #define vpx_comp_avg_pred vpx_comp_avg_pred_c

-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8 vpx_convolve8_c

-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg vpx_convolve8_avg_c

-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c

-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c

-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_horiz vpx_convolve8_horiz_c

-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_vert vpx_convolve8_vert_c

-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_avg vpx_convolve_avg_c

-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_copy vpx_convolve_copy_c

 void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -229,15 +231,843 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c

-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
 #define vpx_hadamard_16x16 vpx_hadamard_16x16_c

-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
 #define vpx_hadamard_8x8 vpx_hadamard_8x8_c

 void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c

+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+
+unsigned int
vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t 
vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, 
int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t 
*ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_c + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c + +void vpx_highbd_convolve_avg_c(const uint16_t *src, 
ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c + +void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c + 
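[Editorial note — not part of the diff. The wall of *_variance prototypes above all follow one recipe; only the block size baked into the name and the bit depth prefix change. A minimal 8-bit sketch of that recipe, with a hypothetical name (the real vpx_highbd_* versions read uint16_t samples instead):]

#include <stdint.h>

/* Sketch only: returns variance and reports the raw sum of squared
 * differences through *sse, mirroring the libvpx variance contract. */
static unsigned int block_variance(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h, unsigned int *sse) {
  int64_t sum = 0;   /* signed sum of differences */
  uint64_t sq = 0;   /* sum of squared differences */
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      const int d = src[x] - ref[x];
      sum += d;
      sq += (uint64_t)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (unsigned int)sq;
  /* variance = sse minus the squared-mean correction term */
  return (unsigned int)(sq - (uint64_t)((sum * sum) / (w * h)));
}

[The sse output is the raw distortion; the return value subtracts the squared-mean term, which is why the API reports both.]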
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
 
@@ -400,15 +1230,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_
 unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x32_avg vpx_sad32x32_avg_c
 
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
 void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad32x32x4d vpx_sad32x32x4d_c
 
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x64 vpx_sad32x64_c
 
@@ -442,9 +1266,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_c
 
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad64x32 vpx_sad64x32_c
 
@@ -460,15 +1281,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x64_avg vpx_sad64x64_avg_c
 
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
 void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad64x64x4d vpx_sad64x64x4d_c
 
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x16 vpx_sad8x16_c
 
@@ -493,9 +1308,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_c
 
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x8 vpx_sad8x8_c
 
@@ -511,25 +1323,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
 void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x8x8 vpx_sad8x8x8_c
 
-int vpx_satd_c(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
 #define vpx_satd vpx_satd_c
 
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_2d vpx_scaled_2d_c
 
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
 
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
 
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
 
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c
 
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c
 
 uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
diff --git a/config/generic/vpx_scale_rtcd.h b/config/generic/vpx_scale_rtcd.h
index f419cc7a5..d12f52764 100644
--- a/config/generic/vpx_scale_rtcd.h
+++ b/config/generic/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
 
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
 
diff --git a/config/generic/vpx_version.h b/config/generic/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/generic/vpx_version.h
+++ b/config/generic/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
 #define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
 #define VERSION_EXTRA ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
diff --git a/config/mips32-dspr2/vp8_rtcd.h b/config/mips32-dspr2/vp8_rtcd.h
index a940d3594..e24387399 100644
--- a/config/mips32-dspr2/vp8_rtcd.h
+++ b/config/mips32-dspr2/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
 
diff --git a/config/mips32-dspr2/vp9_rtcd.h b/config/mips32-dspr2/vp9_rtcd.h
index 2e161c181..91d3a1aab 100644
--- a/config/mips32-dspr2/vp9_rtcd.h
+++ b/config/mips32-dspr2/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
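[Editorial note — not part of the diff. The vpx_version.h hunk a few lines up bumps the library from v1.6.1 to v1.7.0, and VERSION_PACKED folds major/minor/patch into one integer so client code can gate on a version with a single comparison. Worked out for these two releases:]

/* v1.7.0: (1 << 16) | (7 << 8) | 0 == 0x00010700 == 67328
 * v1.6.1: (1 << 16) | (6 << 8) | 1 == 0x00010601 == 67073
 * so newer builds always compare greater, as long as each field stays
 * below 256. A caller can therefore write, for example:
 *   #if VERSION_PACKED >= ((1 << 16) | (7 << 8) | 0)
 *   ... use a v1.7.0-or-later feature ...
 *   #endif
 */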
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
 
@@ -33,7 +34,7 @@ extern "C" {
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 #define vp9_block_error_fp vp9_block_error_fp_c
 
 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -51,23 +52,50 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 #define vp9_fht8x8 vp9_fht8x8_c
 
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_c
 
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_dspr2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_dspr2
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
 
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht4x4_16_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_dspr2
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
 
 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_dspr2
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
 
 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp vp9_quantize_fp_c
 
@@ -78,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
 
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
 void vp9_rtcd(void);
 
 #include "vpx_config.h"
diff --git a/config/mips32-dspr2/vpx_config.c b/config/mips32-dspr2/vpx_config.c
index 1aa002457..0471682f6 100644
--- a/config/mips32-dspr2/vpx_config.c
+++ b/config/mips32-dspr2/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS. All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=mips32-linux-gcc --enable-dspr2 --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=mips32-linux-gcc --enable-dspr2 --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/mips32-dspr2/vpx_config.h b/config/mips32-dspr2/vpx_config.h
index 6df484f60..9bdc19616 100644
--- a/config/mips32-dspr2/vpx_config.h
+++ b/config/mips32-dspr2/vpx_config.h
@@ -29,7 +29,9 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
+#define HAVE_AVX512 0
 #define HAVE_VSX 0
+#define HAVE_MMI 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
 #define CONFIG_TEMPORAL_DENOISING 1
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/mips32-dspr2/vpx_dsp_rtcd.h b/config/mips32-dspr2/vpx_dsp_rtcd.h
index cdb0cfc6e..bd4acd0ff 100644
--- a/config/mips32-dspr2/vpx_dsp_rtcd.h
+++ b/config/mips32-dspr2/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
 
@@ -13,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 
 #ifdef __cplusplus
 
@@ -28,36 +30,36 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 #define vpx_comp_avg_pred vpx_comp_avg_pred_c
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8 vpx_convolve8_dspr2
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t
src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg vpx_convolve8_avg_dspr2 -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_dspr2 -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_dspr2 -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_horiz vpx_convolve8_horiz_dspr2 -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const 
InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_vert vpx_convolve8_vert_dspr2 -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_avg vpx_convolve_avg_dspr2 -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_dspr2 void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -243,66 +245,881 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_dspr2 -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_16x16 vpx_hadamard_16x16_c -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_8x8 vpx_hadamard_8x8_c void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c +void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c + +void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c + +unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c + +unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c + +unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c + +unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c + +uint32_t 
vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c + +uint32_t 
vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c + +unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c + +unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c + +unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c + +unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c + +unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c + +unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c + +unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c + +unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c + +unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c + +unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c + +unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c + +unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c + +void 
vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c + +void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c + +unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c + +unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c + +unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c + +unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define 
vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 
vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const 
uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, 
int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_c + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define 
vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c + +void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c + +void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 
vpx_highbd_d207_predictor_8x8_c + +void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c + +void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c + +void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c + +void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c + +void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c + +void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c + +void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c + +void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c + +void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c + +void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c + +void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c + +void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c + +void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c + +void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c + +void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c + +void 
vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c + +void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c + +void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c + +void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c + +void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c + +void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c + +void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c + +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c + +void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c + +void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c + +void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c + +void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t 
*dest, int stride, int bd); +#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c + +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c + +void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const 
uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c + +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int 
ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c + +unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c + +unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c + +unsigned int 
vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c + +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c + +void vpx_highbd_tm_predictor_4x4_c(uint16_t 
*dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c + +void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c + +void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c + +void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c + +void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_10_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_10_add vpx_idct16x16_10_add_dspr2 +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_1_add vpx_idct16x16_1_add_dspr2 +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_256_add vpx_idct16x16_256_add_dspr2 +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_38_add vpx_idct16x16_256_add_dspr2 +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_dspr2 +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_dspr2 +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_1_add vpx_idct32x32_1_add_dspr2 +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_34_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_34_add vpx_idct32x32_34_add_dspr2 +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct4x4_16_add 
vpx_idct4x4_16_add_dspr2 +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct4x4_1_add vpx_idct4x4_1_add_dspr2 +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_12_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_12_add vpx_idct8x8_12_add_dspr2 +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_1_add vpx_idct8x8_1_add_dspr2 +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_64_add vpx_idct8x8_64_add_dspr2 +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); #define vpx_int_pro_col vpx_int_pro_col_c @@ -439,15 +1256,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_c -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad32x32x4d vpx_sad32x32x4d_c -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c - unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad32x64 vpx_sad32x64_c @@ -481,9 +1292,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad4x8x4d vpx_sad4x8x4d_c -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c - unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad64x32 vpx_sad64x32_c @@ -499,15 +1307,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_c -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad64x64x4d vpx_sad64x64x4d_c -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c - 
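The hunks above follow the pattern used throughout these generated RTCD (run-time CPU detection) headers: each DSP routine is declared with a `_c` suffix for the portable reference version, and a `#define` binds the unsuffixed name to whichever implementation this target gets. With `--enable-vp9-highbitdepth` now enabled for these MIPS configs, the dspr2/msa specializations are dropped and the macros rebind to the C versions, while the unused `x3`/`x8` SAD variants are removed outright. Below is a minimal sketch of that binding scheme; `my_sad_c` and `my_sad_dspr2` are hypothetical placeholder names, not real libvpx symbols, and the real generated headers emit only the already-resolved `#define` — the `#if` here just illustrates the choice the generator makes.

/*
 * Hypothetical sketch of the RTCD binding pattern seen in these headers.
 * my_sad_c / my_sad_dspr2 are placeholder names, not real libvpx symbols;
 * the generated headers contain only the resolved #define.
 */
#include <stdint.h>

unsigned int my_sad_c(const uint8_t *src, int src_stride,
                      const uint8_t *ref, int ref_stride);

#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
unsigned int my_sad_dspr2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride);
#define my_sad my_sad_dspr2 /* platform-specific version available */
#else
#define my_sad my_sad_c /* fall back to the portable C reference */
#endif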
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x16 vpx_sad8x16_c @@ -532,9 +1334,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad8x4x4d vpx_sad8x4x4d_c -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c - unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x8 vpx_sad8x8_c @@ -550,25 +1349,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c -int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_c(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_c -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_2d vpx_scaled_2d_c -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void 
vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); diff --git a/config/mips32-dspr2/vpx_scale_rtcd.h b/config/mips32-dspr2/vpx_scale_rtcd.h index 15b1b5a6f..487bc29b8 100644 --- a/config/mips32-dspr2/vpx_scale_rtcd.h +++ b/config/mips32-dspr2/vpx_scale_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -48,6 +49,9 @@ void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_dspr2(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_dspr2 +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c diff --git a/config/mips32-dspr2/vpx_version.h b/config/mips32-dspr2/vpx_version.h index 24da169b4..6078bae90 100644 --- a/config/mips32-dspr2/vpx_version.h +++ b/config/mips32-dspr2/vpx_version.h @@ -1,7 +1,8 @@ +// This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 6 -#define VERSION_PATCH 1 +#define VERSION_MINOR 7 +#define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.1" -#define VERSION_STRING " v1.6.1" +#define VERSION_STRING_NOSP "v1.7.0" +#define VERSION_STRING " v1.7.0" diff --git a/config/mips32-msa/vp8_rtcd.h b/config/mips32-msa/vp8_rtcd.h index a851d7f13..00469b064 100644 --- a/config/mips32-msa/vp8_rtcd.h +++ b/config/mips32-msa/vp8_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ diff --git a/config/mips32-msa/vp9_rtcd.h b/config/mips32-msa/vp9_rtcd.h index d0adf351e..91d3a1aab 100644 --- a/config/mips32-msa/vp9_rtcd.h +++ b/config/mips32-msa/vp9_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
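For reference, the vpx_version.h hunk above moves these configs from libvpx v1.6.1 to v1.7.0. VERSION_PACKED stores the three version components in one integer, with eight bits each for minor and patch, so v1.7.0 packs to (1<<16)|(7<<8)|0 = 0x10700 and v1.6.1 to 0x10601. A small standalone check, with the VERSION_PACKED macro copied from the hunk and the rest written as a sketch:

/* Quick check of the VERSION_PACKED encoding from vpx_version.h;
 * version values are those of the v1.7.0 hunk in this diff. */
#include <assert.h>
#include <stdio.h>

#define VERSION_MAJOR 1
#define VERSION_MINOR 7
#define VERSION_PATCH 0
/* Major in bits 16 and up, minor in bits 8..15, patch in bits 0..7. */
#define VERSION_PACKED \
  ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))

int main(void) {
  assert(VERSION_PACKED == 0x10700); /* v1.7.0 */
  printf("v%d.%d.%d packs to 0x%05x\n", VERSION_MAJOR, VERSION_MINOR,
         VERSION_PATCH, VERSION_PACKED);
  return 0;
}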
#ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -31,10 +32,9 @@ extern "C" { #endif int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -int64_t vp9_block_error_msa(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -#define vp9_block_error vp9_block_error_msa +#define vp9_block_error vp9_block_error_c -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); #define vp9_block_error_fp vp9_block_error_fp_c int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); @@ -44,35 +44,58 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr #define vp9_fdct8x8_quant vp9_fdct8x8_quant_c void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -void vp9_fht16x16_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_fht16x16 vp9_fht16x16_msa +#define vp9_fht16x16 vp9_fht16x16_c void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -void vp9_fht4x4_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_fht4x4 vp9_fht4x4_msa +#define vp9_fht4x4 vp9_fht4x4_c void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -void vp9_fht8x8_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_fht8x8 vp9_fht8x8_msa - -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -#define vp9_full_search_sad vp9_full_search_sad_c +#define vp9_fht8x8 vp9_fht8x8_c void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); -void vp9_fwht4x4_msa(const int16_t *input, tran_low_t *output, int stride); -#define vp9_fwht4x4 vp9_fwht4x4_msa +#define vp9_fwht4x4 vp9_fwht4x4_c + +int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); +#define vp9_highbd_block_error vp9_highbd_block_error_c + +void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c + +void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c + +void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c + +void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c + +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd); +#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c + +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c + +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c + +void 
vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c + +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c + +void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); +#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht16x16_256_add_msa(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -#define vp9_iht16x16_256_add vp9_iht16x16_256_add_msa +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_iht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -#define vp9_iht4x4_16_add vp9_iht4x4_16_add_msa +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_iht8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -#define vp9_iht8x8_64_add vp9_iht8x8_64_add_msa +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_fp vp9_quantize_fp_c @@ -83,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c - void vp9_rtcd(void); #include "vpx_config.h" diff --git a/config/mips32-msa/vpx_config.c b/config/mips32-msa/vpx_config.c index 21f7c3b82..737e5530b 100644 --- a/config/mips32-msa/vpx_config.c +++ b/config/mips32-msa/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=mips32-linux-gcc --enable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072"; +static const char* const cfg = "--target=mips32-linux-gcc --enable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/config/mips32-msa/vpx_config.h b/config/mips32-msa/vpx_config.h index 53831030d..9ca17a018 100644 --- a/config/mips32-msa/vpx_config.h +++ b/config/mips32-msa/vpx_config.h @@ -29,7 +29,9 @@ #define HAVE_SSE4_1 0 #define HAVE_AVX 0 #define HAVE_AVX2 0 +#define HAVE_AVX512 0 #define HAVE_VSX 0 +#define HAVE_MMI 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 1 #define HAVE_UNISTD_H 1 @@ -83,10 +85,11 @@ #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 #define CONFIG_SPATIAL_SVC 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 diff --git a/config/mips32-msa/vpx_dsp_rtcd.h b/config/mips32-msa/vpx_dsp_rtcd.h index 22c63bfbc..4558d6960 100644 --- a/config/mips32-msa/vpx_dsp_rtcd.h +++ b/config/mips32-msa/vpx_dsp_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #ifdef __cplusplus @@ -30,36 +32,36 @@ unsigned int vpx_avg_8x8_msa(const uint8_t *, int p); void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); #define vpx_comp_avg_pred vpx_comp_avg_pred_c -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8 vpx_convolve8_msa -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, 
ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg vpx_convolve8_avg_msa -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_msa -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_msa -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_horiz vpx_convolve8_horiz_msa -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int 
y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_vert vpx_convolve8_vert_msa -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_avg vpx_convolve_avg_msa -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_msa void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -205,35 +207,28 @@ void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, const uint8_ #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_msa void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); -void vpx_fdct16x16_msa(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16 vpx_fdct16x16_msa +#define vpx_fdct16x16 vpx_fdct16x16_c void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); -void vpx_fdct16x16_1_msa(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16_1 vpx_fdct16x16_1_msa +#define vpx_fdct16x16_1 vpx_fdct16x16_1_c void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); -void vpx_fdct32x32_msa(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32 vpx_fdct32x32_msa +#define vpx_fdct32x32 vpx_fdct32x32_c void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); -void vpx_fdct32x32_1_msa(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32_1 vpx_fdct32x32_1_msa +#define vpx_fdct32x32_1 vpx_fdct32x32_1_c void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); -void vpx_fdct32x32_rd_msa(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32_rd vpx_fdct32x32_rd_msa +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); -void vpx_fdct4x4_msa(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct4x4 vpx_fdct4x4_msa +#define vpx_fdct4x4 vpx_fdct4x4_c void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, 
int stride); #define vpx_fdct4x4_1 vpx_fdct4x4_1_c void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); -void vpx_fdct8x8_msa(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct8x8 vpx_fdct8x8_msa +#define vpx_fdct8x8 vpx_fdct8x8_c void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *output, int stride); @@ -271,68 +266,881 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_msa -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_16x16_msa(const int16_t *src_diff, int src_stride, int16_t *coeff); -#define vpx_hadamard_16x16 vpx_hadamard_16x16_msa +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_hadamard_16x16 vpx_hadamard_16x16_c -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_msa(const int16_t *src_diff, int src_stride, int16_t *coeff); -#define vpx_hadamard_8x8 vpx_hadamard_8x8_msa +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_hadamard_8x8 vpx_hadamard_8x8_c void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c +void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c + +void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c + +unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c + +unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c + +unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c + +unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t 
*ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c + +unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c + +unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c + +unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c + +unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c + +unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c + +unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c + +unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c + +unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c + +unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c + +unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c + +unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c + +unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c + +void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c + +void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c + +unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c + +unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c + +unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c + +unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c + +uint32_t 
vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int 
vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 
vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t 
vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int 
p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_c + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const 
uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c + +void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c + +void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c + +void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c + +void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c + +void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c + +void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c + +void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c + +void 
vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c + +void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c + +void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c + +void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c + +void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c + +void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c + +void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c + +void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c + +void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c + +void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c + +void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c + +void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c + +void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c + +void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c + +void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c + +void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c + +void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c + +void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t 
y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c + +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c + +void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c + +void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c + +void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c + +void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c + +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c + +void 
vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c + +void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int 
pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c + +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c + +unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c + +unsigned int 
vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c + +unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define 
vpx_highbd_sad8x16 vpx_highbd_sad8x16_c + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c + +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c + +void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c + +void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c + +void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c + +void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c + +void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_10_add_msa(const tran_low_t *input, uint8_t *dest, int 
stride);
-#define vpx_idct16x16_10_add vpx_idct16x16_10_add_msa
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c

void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_1_add vpx_idct16x16_1_add_msa
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c

void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_256_add vpx_idct16x16_256_add_msa
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c

void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_38_add vpx_idct16x16_256_add_msa
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c

void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_msa
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c

void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_msa
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c

void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_1_add vpx_idct32x32_1_add_msa
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c

void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_34_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_34_add vpx_idct32x32_34_add_msa
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c

void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct4x4_16_add vpx_idct4x4_16_add_msa
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c

void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct4x4_1_add vpx_idct4x4_1_add_msa
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c

void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_12_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_12_add vpx_idct8x8_12_add_msa
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c

void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_1_add vpx_idct8x8_1_add_msa
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c

void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_64_add vpx_idct8x8_64_add_msa
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
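Note the pattern running through this hunk: each MSA inverse-transform kernel is deleted and the dispatch macro is retargeted at the C fallback (in passing it also fixes vpx_idct16x16_38_add, which previously fell back to the 256-coefficient MSA kernel). This is consistent with the rest of the diff, where coefficient-domain functions now take tran_low_t, which widens beyond int16_t in high-bit-depth builds and no longer matches the int16_t-based MSA code, while pixel-domain MSA kernels survive. Callers are insulated by the macro indirection; a sketch, assuming the generated header is included the way libvpx sources usually include it, with a hypothetical helper whose eob thresholds mirror the suffixes rather than vp9's exact selection logic:

#include "./vpx_dsp_rtcd.h"  /* generated dispatch header, in-tree include style */

/* Hypothetical helper: callers use the macro name, so remapping
 * vpx_idct16x16_*_add from _msa to _c changes nothing at call sites. */
static void inverse_transform_16x16(const tran_low_t *coeff, uint8_t *dst,
                                    int stride, int eob) {
  if (eob == 1)
    vpx_idct16x16_1_add(coeff, dst, stride);    /* DC-only shortcut */
  else if (eob <= 10)
    vpx_idct16x16_10_add(coeff, dst, stride);   /* few nonzero coefficients */
  else
    vpx_idct16x16_256_add(coeff, dst, stride);  /* full 16x16 transform */
}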
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width);
@@ -343,12 +1151,10 @@ void vpx_int_pro_row_msa(int16_t *hbuf, const uint8_t *ref, const int ref_stride
#define vpx_int_pro_row vpx_int_pro_row_msa

void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_iwht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_msa
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c

void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_iwht4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_msa
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c

void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
void vpx_lpf_horizontal_16_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -496,18 +1302,10 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int vpx_sad32x32_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad32x32_avg vpx_sad32x32_avg_msa

-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x3_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_msa
-
void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad32x32x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_msa

-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_msa
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad32x64_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x64 vpx_sad32x64_msa
@@ -552,10 +1350,6 @@ void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad4x8x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad4x8x4d vpx_sad4x8x4d_msa

-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_msa
-
unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad64x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x32 vpx_sad64x32_msa
@@ -576,18 +1370,10 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int vpx_sad64x64_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad64x64_avg vpx_sad64x64_avg_msa
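The x3 and x8 multi-SAD variants are dropped outright in these hunks, while the x4d form is kept with its MSA implementation: x4d scores one source block against four candidate references per call, which is the batch shape the motion-search loops actually use. A rough sketch of that contract, with a hypothetical helper on the 8-bit path and a fixed 64x64 block:

/* Illustrative only: mirrors the vpx_sad64x64x4d_c signature above --
 * one source block, four candidate references, four SADs out. */
#include <stdint.h>
#include <stdlib.h>

static void sad64x64x4d_sketch(const uint8_t *src, int src_stride,
                               const uint8_t *const ref[4], int ref_stride,
                               uint32_t sad[4]) {
  for (int k = 0; k < 4; ++k) {
    uint32_t acc = 0;
    const uint8_t *s = src;
    const uint8_t *r = ref[k];
    for (int i = 0; i < 64; ++i) {
      for (int j = 0; j < 64; ++j) acc += abs(s[j] - r[j]);
      s += src_stride;
      r += ref_stride;
    }
    sad[k] = acc;  /* sum of absolute differences for candidate k */
  }
}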
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x3_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_msa
-
void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad64x64x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_msa

-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_msa
-
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_msa
@@ -620,10 +1406,6 @@ void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad8x4x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad8x4x4d vpx_sad8x4x4d_msa

-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x4x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_msa
-
unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_msa
@@ -644,26 +1426,26 @@ void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p
void vpx_sad8x8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_msa

-int vpx_satd_c(const int16_t *coeff, int length);
-int vpx_satd_msa(const int16_t *coeff, int length);
-#define vpx_satd vpx_satd_msa
+int vpx_satd_c(const tran_low_t *coeff, int length);
+#define vpx_satd vpx_satd_c

-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vpx_scaled_2d vpx_scaled_2d_c
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_msa

-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
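The vpx_scaled_* prototypes change shape here: instead of independent int16_t filter_x/filter_y pointers, they take a single InterpKernel filter bank plus explicit x0_q4/y0_q4 starting phases. An InterpKernel is a bank of 8-tap filter rows indexed by 1/16-pel phase, so a scaling convolution walks a phase accumulator and picks one row per output sample. A sketch of the per-sample step under those assumptions; the typedef name and helper are illustrative, and libvpx's usual 7-bit filter scale is assumed for the rounding:

typedef short InterpKernelRow[8];  /* stand-in for libvpx's 8-tap rows */

/* Hypothetical helper: apply the 8-tap row selected by the current
 * subpixel phase; src points at the sample under the filter center. */
static int filter_one_sample(const unsigned char *src,
                             const InterpKernelRow *kernel, int phase_q4) {
  const short *taps = kernel[phase_q4 & 0x0f];  /* 16 phases, 1/16 pel apart */
  int acc = 0;
  for (int t = 0; t < 8; ++t) acc += taps[t] * src[t - 3];
  return (acc + 64) >> 7;  /* round back to pixel range at 7-bit filter scale */
}

For output sample x, the phase is x0_q4 + x * x_step_q4: the integer part (phase >> 4) advances the source pointer and the low four bits select the row, which is exactly what the added x0_q4/y0_q4 arguments make explicit.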
int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); diff --git a/config/mips32-msa/vpx_scale_rtcd.h b/config/mips32-msa/vpx_scale_rtcd.h index ea70efc9d..eb6c009e1 100644 --- a/config/mips32-msa/vpx_scale_rtcd.h +++ b/config/mips32-msa/vpx_scale_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c diff --git a/config/mips32-msa/vpx_version.h b/config/mips32-msa/vpx_version.h index 24da169b4..6078bae90 100644 --- a/config/mips32-msa/vpx_version.h +++ b/config/mips32-msa/vpx_version.h @@ -1,7 +1,8 @@ +// This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 6 -#define VERSION_PATCH 1 +#define VERSION_MINOR 7 +#define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.1" -#define VERSION_STRING " v1.6.1" +#define VERSION_STRING_NOSP "v1.7.0" +#define VERSION_STRING " v1.7.0" diff --git a/config/mips32/vp8_rtcd.h b/config/mips32/vp8_rtcd.h index 21dfa5a25..fbd444b8a 100644 --- a/config/mips32/vp8_rtcd.h +++ b/config/mips32/vp8_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
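Editorial note: the vpx_version.h hunk above moves the bundled libvpx from v1.6.1 to v1.7.0. As a quick worked check of the VERSION_PACKED encoding defined there (major in bits 16 and up, minor in bits 8-15, patch in bits 0-7); this snippet is illustrative only and is not part of the diff:

#include <assert.h>

int main(void) {
  /* v1.7.0: (1 << 16) | (7 << 8) | 0 */
  assert(((1 << 16) | (7 << 8) | 0) == 0x10700); /* 67328 */
  /* old v1.6.1 packed value, for comparison */
  assert(((1 << 16) | (6 << 8) | 1) == 0x10601); /* 67073 */
  return 0;
}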
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
diff --git a/config/mips32/vp9_rtcd.h b/config/mips32/vp9_rtcd.h
index c17a21721..91d3a1aab 100644
--- a/config/mips32/vp9_rtcd.h
+++ b/config/mips32/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -33,7 +34,7 @@ extern "C" {
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -51,12 +52,42 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_fht8x8 vp9_fht8x8_c
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_c
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
@@ -75,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
void vp9_rtcd(void);
#include "vpx_config.h"
diff --git a/config/mips32/vpx_config.c b/config/mips32/vpx_config.c
index e2703b374..f66993f87 100644
--- a/config/mips32/vpx_config.c
+++ b/config/mips32/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=mips32-linux-gcc --disable-dspr2 --disable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=mips32-linux-gcc --disable-dspr2 --disable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/mips32/vpx_config.h b/config/mips32/vpx_config.h
index beaa2f86c..d3ecfe8e7 100644
--- a/config/mips32/vpx_config.h
+++ b/config/mips32/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/mips32/vpx_dsp_rtcd.h b/config/mips32/vpx_dsp_rtcd.h
index 1b15aadba..fbb38953d 100644
--- a/config/mips32/vpx_dsp_rtcd.h
+++ b/config/mips32/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
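Editorial note on the two type changes that drive most of the vpx_dsp_rtcd.h churn below: flipping CONFIG_VP9_HIGHBITDEPTH to 1 (see the vpx_config.h hunk above) widens transform coefficients, so coefficient arguments move from int16_t to tran_low_t (vp9_block_error_fp above; vpx_satd and the vpx_hadamard_* functions below), and the convolve/scale prototypes now take a single const InterpKernel *filter plus explicit starting phases (x0_q4, y0_q4) instead of separate pre-resolved filter_x/filter_y pointers. A sketch of the two definitions involved, as they appear in upstream vpx_dsp/vpx_dsp_common.h and vpx_dsp/vpx_filter.h (paraphrased here for orientation, not quoted from this diff):

/* vpx_dsp/vpx_dsp_common.h: 10/12-bit coefficients overflow int16_t */
#if CONFIG_VP9_HIGHBITDEPTH
typedef int32_t tran_low_t;
#else
typedef int16_t tran_low_t;
#endif

/* vpx_dsp/vpx_filter.h: one 8-tap kernel per subpel phase; callers now
 * pass the whole table plus a phase index instead of two raw pointers */
#define SUBPEL_TAPS 8
typedef int16_t InterpKernel[SUBPEL_TAPS];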
#ifndef VPX_DSP_RTCD_H_
#define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
@@ -28,28 +30,28 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
#define vpx_comp_avg_pred vpx_comp_avg_pred_c
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8 vpx_convolve8_c
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg vpx_convolve8_avg_c
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_c
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_c
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_c
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_c
void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -229,15 +231,843 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_16x16 vpx_hadamard_16x16_c
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_8x8 vpx_hadamard_8x8_c
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8
vpx_highbd_10_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 
vpx_highbd_10_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c + +unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c + +unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 
vpx_highbd_10_variance16x8_c + +unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c + +unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c + +unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c + +unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c + +unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c + +unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c + +unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c + +unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c + +unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c + +unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c + +void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c + +void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c + +unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c + +unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c + +unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c + +unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t 
*sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define 
vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const 
uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); 
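A note on the vpx_highbd_* prototypes in this block: they keep const uint8_t * pixel arguments even though high-bit-depth samples are 16 bits wide. libvpx routes uint16_t buffers through pointer-shifting macros; the sketch below reproduces them from memory as they appear in vpx_ports/mem.h, so treat it as illustrative rather than part of this diff:

/* highbd buffers travel as uint8_t * with the address pre-shifted,
 * so the callee can recover the real uint16_t * */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

For example, a caller holding uint16_t *src16 and uint16_t *ref16 would invoke vpx_highbd_8_variance8x8(CONVERT_TO_BYTEPTR(src16), src_stride, CONVERT_TO_BYTEPTR(ref16), ref_stride, &sse), and the implementation converts the arguments back with CONVERT_TO_SHORTPTR before reading pixels.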
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_c + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c + +void 
vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c + +void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c + +void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c + +void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c + +void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c + +void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c + +void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int 
bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c + +void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c + +void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c + +void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c + +void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c + +void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c + +void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c + +void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c + +void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c + +void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c + +void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c + +void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c + +void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c + +void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c + +void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c + +void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c + +void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c + +void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_8x8 
vpx_highbd_dc_top_predictor_8x8_c + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c + +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c + +void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c + +void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c + +void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c + +void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c + +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define 
vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c + +void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t 
*blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c + +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c + +unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c + +unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define 
vpx_highbd_sad32x32 vpx_highbd_sad32x32_c + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c + +unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c + +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c + +void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c + +void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c + +void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c + +void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c + +void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c @@ -400,15 +1230,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int 
vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x32_avg vpx_sad32x32_avg_c
 
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
 void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad32x32x4d vpx_sad32x32x4d_c
 
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x64 vpx_sad32x64_c
@@ -442,9 +1266,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_c
 
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad64x32 vpx_sad64x32_c
@@ -460,15 +1281,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x64_avg vpx_sad64x64_avg_c
 
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
 void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad64x64x4d vpx_sad64x64x4d_c
 
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x16 vpx_sad8x16_c
@@ -493,9 +1308,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_c
 
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x8 vpx_sad8x8_c
@@ -511,25 +1323,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
 void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x8x8 vpx_sad8x8x8_c
 
-int vpx_satd_c(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
 #define vpx_satd vpx_satd_c
 
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_2d vpx_scaled_2d_c
 
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
 
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
 
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
 
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c
 
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c
 
 uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
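The vpx_scaled_* prototypes just above (and the vpx_convolve8_* family later in this section) switch from separate filter_x/filter_y pointers with per-axis steps to a single InterpKernel filter bank plus explicit starting phases (x0_q4, y0_q4); vpx_dsp/vpx_filter.h is now pulled into the generated headers for that type. Below is a minimal caller sketch against the new signature. The InterpKernel typedef is an assumption intended to match vpx_dsp/vpx_filter.h, and demo_kernels/scale_half are hypothetical names used only for illustration; linking against libvpx is required to actually run it.

/* Sketch only: exercises the new-style vpx_scaled_2d_c() declared above.
 * The typedef below is an assumption matching vpx_dsp/vpx_filter.h. */
#include <stddef.h>
#include <stdint.h>

typedef int16_t InterpKernel[8];

/* Prototype as it appears in the generated header above. */
void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const InterpKernel *filter,
                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                     int w, int h);

/* Hypothetical 16-phase kernel bank; row 0 is a pass-through tap set and
 * the remaining rows are left zeroed for brevity. */
static const InterpKernel demo_kernels[16] = {
  { 0, 0, 0, 128, 0, 0, 0, 0 },
};

/* Phases and steps are in 1/16-pel (q4) units: a step of 32 advances the
 * source by two pixels per output pixel, i.e. a 2:1 downscale starting at
 * phase 0 on both axes. */
static void scale_half(const uint8_t *src, ptrdiff_t src_stride,
                       uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  vpx_scaled_2d_c(src, src_stride, dst, dst_stride, demo_kernels,
                  0, 32, 0, 32, w, h);
}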
diff --git a/config/mips32/vpx_scale_rtcd.h b/config/mips32/vpx_scale_rtcd.h
index ea70efc9d..eb6c009e1 100644
--- a/config/mips32/vpx_scale_rtcd.h
+++ b/config/mips32/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/config/mips32/vpx_version.h b/config/mips32/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/mips32/vpx_version.h
+++ b/config/mips32/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
 #define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
 #define VERSION_EXTRA ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
diff --git a/config/mips64-msa/vp8_rtcd.h b/config/mips64-msa/vp8_rtcd.h
index a851d7f13..00469b064 100644
--- a/config/mips64-msa/vp8_rtcd.h
+++ b/config/mips64-msa/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
diff --git a/config/mips64-msa/vp9_rtcd.h b/config/mips64-msa/vp9_rtcd.h
index d0adf351e..91d3a1aab 100644
--- a/config/mips64-msa/vp9_rtcd.h
+++ b/config/mips64-msa/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
@@ -31,10 +32,9 @@ extern "C" {
 #endif
 
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-int64_t vp9_block_error_msa(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_msa
+#define vp9_block_error vp9_block_error_c
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 #define vp9_block_error_fp vp9_block_error_fp_c
 
 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -44,35 +44,58 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr
 #define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-void vp9_fht16x16_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht16x16 vp9_fht16x16_msa
+#define vp9_fht16x16 vp9_fht16x16_c
 
 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-void vp9_fht4x4_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht4x4 vp9_fht4x4_msa
+#define vp9_fht4x4 vp9_fht4x4_c
 
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-void vp9_fht8x8_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht8x8 vp9_fht8x8_msa
-
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
+#define vp9_fht8x8 vp9_fht8x8_c
 
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
-void vp9_fwht4x4_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_msa
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
 
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_msa(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_msa
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
 
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_msa
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
 
 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_msa
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
 
 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp vp9_quantize_fp_c
@@ -83,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
 
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
 void vp9_rtcd(void);
 
 #include "vpx_config.h"
diff --git a/config/mips64-msa/vpx_config.c b/config/mips64-msa/vpx_config.c
index c1ab4fb90..9665244e3 100644
--- a/config/mips64-msa/vpx_config.c
+++ b/config/mips64-msa/vpx_config.c
@@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=mips64-linux-gcc --enable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=mips64-linux-gcc --enable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/mips64-msa/vpx_config.h b/config/mips64-msa/vpx_config.h
index ea5e8bd43..df9ed4455 100644
--- a/config/mips64-msa/vpx_config.h
+++ b/config/mips64-msa/vpx_config.h
@@ -29,7 +29,9 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
+#define HAVE_AVX512 0
 #define HAVE_VSX 0
+#define HAVE_MMI 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
 #define CONFIG_TEMPORAL_DENOISING 1
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
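Two points of interest in the hunks above: the mips64-msa tree is now configured with --enable-vp9-highbitdepth (CONFIG_VP9_HIGHBITDEPTH flips from 0 to 1), which is what introduces the vp9_highbd_*/vpx_highbd_* prototypes seen throughout this section, and the package version moves from v1.6.1 to v1.7.0. A stand-alone sanity check of the packing arithmetic, with VERSION_PACKED copied from the mips32/vpx_version.h hunk:

#include <assert.h>
#include <stdio.h>

#define VERSION_MAJOR 1
#define VERSION_MINOR 7
#define VERSION_PATCH 0
/* Copied from the vpx_version.h hunk above: 8 bits each for minor and
 * patch below the major number. */
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))

int main(void) {
  /* v1.7.0 packs to 0x010700; the old v1.6.1 packed to 0x010601. */
  assert(VERSION_PACKED == 0x010700);
  printf("v%d.%d.%d -> 0x%06x\n", VERSION_MAJOR, VERSION_MINOR,
         VERSION_PATCH, VERSION_PACKED);
  return 0;
}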
diff --git a/config/mips64-msa/vpx_dsp_rtcd.h b/config/mips64-msa/vpx_dsp_rtcd.h
index 22c63bfbc..4558d6960 100644
--- a/config/mips64-msa/vpx_dsp_rtcd.h
+++ b/config/mips64-msa/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 
 #ifdef __cplusplus
@@ -30,36 +32,36 @@ unsigned int vpx_avg_8x8_msa(const uint8_t *, int p);
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 #define vpx_comp_avg_pred vpx_comp_avg_pred_c
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8 vpx_convolve8_msa
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg vpx_convolve8_avg_msa
 
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_msa
 
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_msa
 
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_horiz vpx_convolve8_horiz_msa
 
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_vert vpx_convolve8_vert_msa
 
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_avg vpx_convolve_avg_msa
 
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_copy vpx_convolve_copy_msa
 
 void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -205,35 +207,28 @@ void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, const uint8_
 #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_msa
 
 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct16x16_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16 vpx_fdct16x16_msa
+#define vpx_fdct16x16 vpx_fdct16x16_c
 
 void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct16x16_1_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16_1 vpx_fdct16x16_1_msa
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
 
 void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct32x32_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32 vpx_fdct32x32_msa
+#define vpx_fdct32x32 vpx_fdct32x32_c
 
 void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct32x32_1_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32_1 vpx_fdct32x32_1_msa
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
 
 void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct32x32_rd_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32_rd vpx_fdct32x32_rd_msa
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
 
 void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct4x4_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_msa
+#define vpx_fdct4x4 vpx_fdct4x4_c
 
 void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct4x4_1 vpx_fdct4x4_1_c
 
 void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct8x8_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct8x8 vpx_fdct8x8_msa
+#define vpx_fdct8x8 vpx_fdct8x8_c
 
 void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *output, int stride);
@@ -271,68 +266,881 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_msa
 
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_msa(const int16_t *src_diff, int src_stride, int16_t *coeff);
-#define vpx_hadamard_16x16 vpx_hadamard_16x16_msa
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+#define vpx_hadamard_16x16 vpx_hadamard_16x16_c
 
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_msa(const int16_t *src_diff, int src_stride, int16_t *coeff);
-#define vpx_hadamard_8x8 vpx_hadamard_8x8_msa
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+#define vpx_hadamard_8x8 vpx_hadamard_8x8_c
 
 void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
 
+void vpx_highbd_10_get16x16var_c(const
uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c + +void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c + +unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c + +unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c + +unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c + +unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 
vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t 
*sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c + +unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c + +unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c + +unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c + +unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c + +unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c + +unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c + +unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c + +unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c + +unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c + +unsigned 
int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c + +unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c + +unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c + +void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c + +void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c + +unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c + +unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c + +unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c + +unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t 
*second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c + +uint32_t 
vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 
vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define 
vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int 
vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_c + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, 
const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c + +void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const 
uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c + +void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c + +void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c + +void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c + +void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c + +void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c + +void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c + +void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c + +void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c + +void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c + +void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c + +void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c + +void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c + +void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_32x32 
vpx_highbd_dc_left_predictor_32x32_c + +void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c + +void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c + +void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c + +void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c + +void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c + +void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c + +void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c + +void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c + +void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c + +void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c + +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c + +void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c + +void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define 
vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c + +void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c + +void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c + +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c + +void 
vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c + +void 
vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c + +unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c + +unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], 
int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c + +unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c +
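The *x4d entries above bundle four SAD computations against four candidate reference blocks into a single call, which is where SIMD versions get their win; the C fallback amounts to a loop over the scalar kernel. A minimal sketch under that reading; sad_16bit and sad_16bit_x4d are hypothetical helpers, not libvpx symbols:

#include <stdint.h>
#include <stdlib.h>

/* Scalar sum of absolute differences over one w x h block. */
static uint32_t sad_16bit(const uint16_t *src, int src_stride,
                          const uint16_t *ref, int ref_stride, int w, int h) {
  uint32_t sad = 0;
  for (int i = 0; i < h; ++i)
    for (int j = 0; j < w; ++j)
      sad += (uint32_t)abs(src[i * src_stride + j] - ref[i * ref_stride + j]);
  return sad;
}

/* The x4d shape: one source block scored against four references at once. */
static void sad_16bit_x4d(const uint16_t *src, int src_stride,
                          const uint16_t *const ref[4], int ref_stride,
                          int w, int h, uint32_t sad_array[4]) {
  for (int k = 0; k < 4; ++k)
    sad_array[k] = sad_16bit(src, src_stride, ref[k], ref_stride, w, h);
}

Batching four references amortizes the source loads during motion search, which is presumably why the generated headers keep a distinct x4d entry point next to the plain and _avg variants.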
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c + +void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c + +void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c + +void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c + +void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c + +void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_10_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_10_add vpx_idct16x16_10_add_msa +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_1_add vpx_idct16x16_1_add_msa +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_256_add vpx_idct16x16_256_add_msa +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_38_add vpx_idct16x16_256_add_msa +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_msa +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_msa +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void 
vpx_idct32x32_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_1_add vpx_idct32x32_1_add_msa +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_34_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_34_add vpx_idct32x32_34_add_msa +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct4x4_16_add vpx_idct4x4_16_add_msa +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct4x4_1_add vpx_idct4x4_1_add_msa +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_12_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_12_add vpx_idct8x8_12_add_msa +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_1_add vpx_idct8x8_1_add_msa +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_64_add vpx_idct8x8_64_add_msa +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width); @@ -343,12 +1151,10 @@ void vpx_int_pro_row_msa(int16_t *hbuf, const uint8_t *ref, const int ref_stride #define vpx_int_pro_row vpx_int_pro_row_msa void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_iwht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_msa +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_iwht4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_msa +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); void vpx_lpf_horizontal_16_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -496,18 +1302,10 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x32_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_msa -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x3_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_msa - void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); void vpx_sad32x32x4d_msa(const uint8_t 
*src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad32x32x4d vpx_sad32x32x4d_msa -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_msa - unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad32x64 vpx_sad32x64_msa @@ -552,10 +1350,6 @@ void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad4x8x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad4x8x4d vpx_sad4x8x4d_msa -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_msa - unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad64x32 vpx_sad64x32_msa @@ -576,18 +1370,10 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x64_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_msa -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x3_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_msa - void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); void vpx_sad64x64x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad64x64x4d vpx_sad64x64x4d_msa -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_msa - unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x16 vpx_sad8x16_msa @@ -620,10 +1406,6 @@ void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x4x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad8x4x4d vpx_sad8x4x4d_msa -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x4x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_msa - unsigned int 
vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x8 vpx_sad8x8_msa @@ -644,26 +1426,26 @@ void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p void vpx_sad8x8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_msa -int vpx_satd_c(const int16_t *coeff, int length); -int vpx_satd_msa(const int16_t *coeff, int length); -#define vpx_satd vpx_satd_msa +int vpx_satd_c(const tran_low_t *coeff, int length); +#define vpx_satd vpx_satd_c -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vpx_scaled_2d vpx_scaled_2d_c +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_msa -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c
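The vpx_scaled_* hunk above changes the calling convention: instead of two independent int16_t filter arrays with an implicit zero starting phase, callers now pass one bank of subpel kernels (InterpKernel) plus explicit starting phases x0_q4/y0_q4 alongside the q4 step sizes, which is what frame scaling needs when the first output sample does not sit on a full-pel source position. A minimal sketch of the horizontal phase stepping, assuming libvpx-style constants (4 subpel bits, 8-tap kernels, 7-bit filter rounding); scaled_horiz_row is a hypothetical helper, not the library's code:

#include <stdint.h>
#include <stddef.h>

typedef int16_t InterpKernel[8]; /* the type named in the prototypes above */
#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

static void scaled_horiz_row(const uint8_t *src, uint8_t *dst,
                             const InterpKernel *filter,
                             int x0_q4, int x_step_q4, int w) {
  int x_q4 = x0_q4; /* running source position in 1/16-pel units */
  for (int x = 0; x < w; ++x) {
    const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];  /* integer pixel position */
    const int16_t *k = filter[x_q4 & SUBPEL_MASK]; /* phase selects the kernel */
    int sum = 0;
    for (int t = 0; t < 8; ++t) sum += s[t] * k[t];
    sum = (sum + 64) >> 7; /* round by FILTER_BITS == 7 */
    dst[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    x_q4 += x_step_q4; /* advance one output pixel per step */
  }
}

The same x0_q4/x_step_q4 pair appears in the vpx_highbd_convolve8_* prototypes earlier in this header, so the scaled and unscaled paths can share one kernel table.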
y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); diff --git a/config/mips64-msa/vpx_scale_rtcd.h b/config/mips64-msa/vpx_scale_rtcd.h index ea70efc9d..eb6c009e1 100644 --- a/config/mips64-msa/vpx_scale_rtcd.h +++ b/config/mips64-msa/vpx_scale_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c diff --git a/config/mips64-msa/vpx_version.h b/config/mips64-msa/vpx_version.h index 24da169b4..6078bae90 100644 --- a/config/mips64-msa/vpx_version.h +++ b/config/mips64-msa/vpx_version.h @@ -1,7 +1,8 @@ +// This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 6 -#define VERSION_PATCH 1 +#define VERSION_MINOR 7 +#define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.1" -#define VERSION_STRING " v1.6.1" +#define VERSION_STRING_NOSP "v1.7.0" +#define VERSION_STRING " v1.7.0" diff --git a/config/mips64/vp8_rtcd.h b/config/mips64/vp8_rtcd.h index 21dfa5a25..fbd444b8a 100644 --- a/config/mips64/vp8_rtcd.h +++ b/config/mips64/vp8_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ diff --git a/config/mips64/vp9_rtcd.h b/config/mips64/vp9_rtcd.h index c17a21721..91d3a1aab 100644 --- a/config/mips64/vp9_rtcd.h +++ b/config/mips64/vp9_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
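
The vpx_version.h hunk above bumps the library from v1.6.1 to v1.7.0. A worked check of the VERSION_PACKED macro defined in that same hunk, using the new values (a minimal standalone sketch, restating only what the hunk itself shows):

#include <assert.h>
#include <stdio.h>

#define VERSION_MAJOR 1
#define VERSION_MINOR 7
#define VERSION_PATCH 0
#define VERSION_PACKED ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))

int main(void) {
  /* v1.7.0 packs to (1 << 16) | (7 << 8) | 0 = 0x010700 (67328 decimal);
   * the old v1.6.1 would pack to 0x010601. */
  assert(VERSION_PACKED == 0x010700);
  printf("packed: 0x%06x\n", VERSION_PACKED);
  return 0;
}
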
#ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -33,7 +34,7 @@ extern "C" { int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_c -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); #define vp9_block_error_fp vp9_block_error_fp_c int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); @@ -51,12 +52,42 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_fht8x8 vp9_fht8x8_c -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -#define vp9_full_search_sad vp9_full_search_sad_c - void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c +int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); +#define vp9_highbd_block_error vp9_highbd_block_error_c + +void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c + +void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c + +void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c + +void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c + +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd); +#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c + +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c + +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c + +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c + +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c + +void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); +#define 
vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c + void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c @@ -75,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c - void vp9_rtcd(void); #include "vpx_config.h" diff --git a/config/mips64/vpx_config.c b/config/mips64/vpx_config.c index f7ec4e6d8..5eb7dff03 100644 --- a/config/mips64/vpx_config.c +++ b/config/mips64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=mips64-linux-gcc --disable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072"; +static const char* const cfg = "--target=mips64-linux-gcc --disable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/config/mips64/vpx_config.h b/config/mips64/vpx_config.h index 9efd808ed..8e67ca3a7 100644 --- a/config/mips64/vpx_config.h +++ b/config/mips64/vpx_config.h @@ -29,7 +29,9 @@ #define HAVE_SSE4_1 0 #define HAVE_AVX 0 #define HAVE_AVX2 0 +#define HAVE_AVX512 0 #define HAVE_VSX 0 +#define HAVE_MMI 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 1 #define HAVE_UNISTD_H 1 @@ -83,10 +85,11 @@ #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 #define CONFIG_SPATIAL_SVC 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 diff --git a/config/mips64/vpx_dsp_rtcd.h b/config/mips64/vpx_dsp_rtcd.h index 1b15aadba..fbb38953d 100644 --- a/config/mips64/vpx_dsp_rtcd.h +++ b/config/mips64/vpx_dsp_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
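
The vpx_dsp_rtcd.h hunks that follow (like the vpx_scaled_* hunks earlier) replace the old pair of filter_x/filter_y pointers with a single const InterpKernel *filter plus explicit x0_q4/y0_q4 starting phases, and the header now pulls in vpx_dsp/vpx_filter.h for the typedef. A minimal sketch of how an old-style call site maps onto the new shape, assuming the 256-byte-aligned, 16-entry kernel-table layout libvpx uses internally; the helper names and the masking trick are restated from the upstream sources as assumptions, not part of the public API:

#include <stddef.h>
#include <stdint.h>

#define SUBPEL_TAPS 8
typedef int16_t InterpKernel[SUBPEL_TAPS]; /* mirrors vpx_dsp/vpx_filter.h */

/* New-style prototype, exactly as declared in the '+' lines below. */
void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const InterpKernel *filter,
                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                     int w, int h);

/* Old callers held a pointer into the middle of a 16-entry kernel table:
 * the table base names the filter, the offset names the first subpel
 * phase. Assuming 256-byte-aligned tables, the base is recoverable by
 * masking the low address bits. */
static const InterpKernel *filter_base(const int16_t *f) {
  return (const InterpKernel *)((intptr_t)f & ~(intptr_t)0xFF);
}

static int filter_offset(const int16_t *f, const InterpKernel *base) {
  return (int)((f - (const int16_t *)base) / SUBPEL_TAPS);
}

/* Illustrative adapter with the pre-1.7.0 shape shown in the '-' lines;
 * assumes filter_x and filter_y come from the same kernel table, as they
 * do in practice in libvpx. */
static void convolve8_old_style(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h) {
  const InterpKernel *base = filter_base(filter_x);
  vpx_convolve8_c(src, src_stride, dst, dst_stride, base,
                  filter_offset(filter_x, base), x_step_q4,
                  filter_offset(filter_y, base), y_step_q4, w, h);
}
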
#ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #ifdef __cplusplus @@ -28,28 +30,28 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p); void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); #define vpx_comp_avg_pred vpx_comp_avg_pred_c -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8 vpx_convolve8_c -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg vpx_convolve8_avg_c -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_horiz vpx_convolve8_horiz_c -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_vert vpx_convolve8_vert_c -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_avg 
vpx_convolve_avg_c -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_c void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -229,15 +231,843 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_16x16 vpx_hadamard_16x16_c -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_8x8 vpx_hadamard_8x8_c void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c +void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c + +void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c + +unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c + +unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c + +unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c + +unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 
vpx_highbd_10_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 
vpx_highbd_10_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c + +unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c + +unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 
vpx_highbd_10_variance16x8_c + +unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c + +unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c + +unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c + +unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c + +unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c + +unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c + +unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c + +unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c + +unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c + +unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c + +void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c + +void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c + +unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c + +unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c + +unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c + +unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t 
*sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define 
vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const 
uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); 
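
Note that the vpx_highbd_*_variance/mse declarations above take const uint8_t * even though high-bit-depth pixels are 16-bit: callers pass a shifted "byte pointer" handle and the implementation shifts it back. A minimal sketch of that idiom; the macro bodies are restated from vpx_dsp/vpx_dsp_common.h as an assumption and should be checked against the tree being built:

#include <stdint.h>

/* Assumption (restated from vpx_dsp/vpx_dsp_common.h): a uint16_t buffer
 * is smuggled through uint8_t * parameters by shifting the address, and
 * shifted back inside the highbd implementation. */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

/* Prototype exactly as declared in the hunk above. */
unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        unsigned int *sse);

/* Hypothetical call site: wrap real uint16_t pixel buffers before
 * calling through the uint8_t * interface. */
static unsigned int variance8x8_hbd(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    unsigned int *sse) {
  return vpx_highbd_8_variance8x8_c(CONVERT_TO_BYTEPTR(src), src_stride,
                                    CONVERT_TO_BYTEPTR(ref), ref_stride, sse);
}
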
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_c + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c + +void 
vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
 
@@ -400,15 +1230,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_
 unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x32_avg vpx_sad32x32_avg_c
 
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
 void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad32x32x4d vpx_sad32x32x4d_c
 
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x64 vpx_sad32x64_c
 
@@ -442,9 +1266,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_c
 
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad64x32 vpx_sad64x32_c
 
@@ -460,15 +1281,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x64_avg vpx_sad64x64_avg_c
 
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
 void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad64x64x4d vpx_sad64x64x4d_c
 
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x16 vpx_sad8x16_c
 
@@ -493,9 +1308,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_c
 
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x8 vpx_sad8x8_c
 
@@ -511,25 +1323,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
 void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x8x8 vpx_sad8x8x8_c
 
-int vpx_satd_c(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
 #define vpx_satd vpx_satd_c
 
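The vpx_satd retype just above (int16_t to tran_low_t) follows from enabling CONFIG_VP9_HIGHBITDEPTH elsewhere in this change: coefficient buffers are declared as tran_low_t so one set of prototypes serves 8-, 10- and 12-bit builds. A minimal sketch of the upstream typedef, paraphrasing vpx_dsp/vpx_dsp_common.h (the comments are illustration, not the upstream text):

    #include <stdint.h>

    /* With high bit depth enabled, intermediate transform coefficients can
     * exceed 16 bits, so the coefficient type widens to 32 bits. This is why
     * vpx_satd, the vpx_hadamard_* functions and vp9_block_error_fp change
     * from int16_t* to tran_low_t* in this diff. */
    #if CONFIG_VP9_HIGHBITDEPTH
    typedef int32_t tran_low_t;
    #else
    typedef int16_t tran_low_t;
    #endif
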
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_2d vpx_scaled_2d_c
 
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
 
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
 
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
 
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c
 
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c
 
 uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
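The scaled_* prototypes above (and the convolve8 family in config/x86/vpx_dsp_rtcd.h later in this diff) swap the filter_x/filter_y pointer pair for a single InterpKernel table plus explicit x0_q4/y0_q4 starting phases; the old API encoded the starting phase by pre-offsetting the filter pointers. A hedged sketch of the migration at a call site (the wrapper and its argument names are hypothetical; InterpKernel is the 8-tap kernel type from vpx_dsp/vpx_filter.h):

    #include <stddef.h>
    #include <stdint.h>

    typedef int16_t InterpKernel[8]; /* per vpx_dsp/vpx_filter.h, SUBPEL_TAPS == 8 */

    /* New prototype, as declared above. */
    void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h);

    /* Hypothetical caller. 'kernels' is the 16-entry subpel filter bank and
     * x0_q4/y0_q4 are the starting 1/16-pel phases; the old API would have
     * been handed kernels[x0_q4] and kernels[y0_q4] instead. */
    static void scale_block(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel kernels[16],
                            int x0_q4, int x_step_q4,
                            int y0_q4, int y_step_q4, int w, int h) {
      vpx_scaled_2d_c(src, src_stride, dst, dst_stride, kernels,
                      x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
    }
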
diff --git a/config/mips64/vpx_scale_rtcd.h b/config/mips64/vpx_scale_rtcd.h
index ea70efc9d..eb6c009e1 100644
--- a/config/mips64/vpx_scale_rtcd.h
+++ b/config/mips64/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
 
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
 
diff --git a/config/mips64/vpx_version.h b/config/mips64/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/mips64/vpx_version.h
+++ b/config/mips64/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
 #define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
 #define VERSION_EXTRA ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
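The version bump also changes the packed value that downstream code can compare numerically; the VERSION_PACKED macro itself is unchanged, so the following is just arithmetic on the values shown above:

    #include <stdio.h>

    #define VERSION_MAJOR 1
    #define VERSION_MINOR 7
    #define VERSION_PATCH 0
    #define VERSION_PACKED \
      ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))

    int main(void) {
      /* v1.7.0 packs to (1 << 16) | (7 << 8) | 0 = 0x10700 (67328);
       * the previous v1.6.1 packed to (1 << 16) | (6 << 8) | 1 = 0x10601. */
      printf("VERSION_PACKED = 0x%x\n", VERSION_PACKED);
      return 0;
    }
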
diff --git a/config/x86/vp8_rtcd.h b/config/x86/vp8_rtcd.h
index 77479a23b..3afbea668 100644
--- a/config/x86/vp8_rtcd.h
+++ b/config/x86/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
 
diff --git a/config/x86/vp9_rtcd.h b/config/x86/vp9_rtcd.h
index b19d6cc5a..49e9885aa 100644
--- a/config/x86/vp9_rtcd.h
+++ b/config/x86/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
 
@@ -34,15 +35,14 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, in
 int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_sse2
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 #define vp9_block_error_fp vp9_block_error_fp_sse2
 
 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define vp9_diamond_search_sad vp9_diamond_search_sad_c
 
 void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_fdct8x8_quant vp9_fdct8x8_quant_ssse3
 
@@ -58,14 +58,44 @@ void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
 void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 #define vp9_fht8x8 vp9_fht8x8_sse2
 
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sadx3
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_sse2
 
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_sse2
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
 void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
 #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
 
@@ -89,9 +119,6 @@ void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct y
 void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_ssse3
 
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
 void vp9_rtcd(void);
 
 #ifdef RTCD_C
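All of these rtcd headers follow one pattern: because this build passes --disable-runtime-cpu-detect (see the configure string in vpx_config.c below), every entry point resolves at build time to a plain macro alias for the best variant the target is known to support (e.g. vp9_block_error maps to vp9_block_error_sse2 above), and the vp9_rtcd() setup function has no function pointers left to patch. A schematic of the pattern with a hypothetical function name:

    /* Schematic only; the real headers are generated from the *_rtcd_defs.pl
     * definitions, not written by hand. */
    int foo_c(int x);    /* portable reference implementation              */
    int foo_sse2(int x); /* specialized variant, always safe on this       */
                         /* target, so it is selected statically           */
    #define foo foo_sse2 /* every foo() call compiles to a direct call     */

    /* With runtime detection enabled, foo would instead be a function
     * pointer that the rtcd() setup routine aims at the best variant
     * after probing the CPU. */
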
diff --git a/config/x86/vpx_config.asm b/config/x86/vpx_config.asm
index e9ff1c42f..d9848bace 100644
--- a/config/x86/vpx_config.asm
+++ b/config/x86/vpx_config.asm
@@ -17,7 +17,9 @@
 %define HAVE_SSE4_1 0
 %define HAVE_AVX 0
 %define HAVE_AVX2 0
+%define HAVE_AVX512 0
 %define HAVE_VSX 0
+%define HAVE_MMI 0
 %define HAVE_VPX_PORTS 1
 %define HAVE_PTHREAD_H 1
 %define HAVE_UNISTD_H 1
@@ -71,10 +73,11 @@
 %define CONFIG_TEMPORAL_DENOISING 1
 %define CONFIG_VP9_TEMPORAL_DENOISING 0
 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-%define CONFIG_VP9_HIGHBITDEPTH 0
+%define CONFIG_VP9_HIGHBITDEPTH 1
 %define CONFIG_BETTER_HW_COMPATIBILITY 0
 %define CONFIG_EXPERIMENTAL 0
 %define CONFIG_SIZE_LIMIT 1
+%define CONFIG_ALWAYS_ADJUST_BPM 0
 %define CONFIG_SPATIAL_SVC 0
 %define CONFIG_FP_MB_STATS 0
 %define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/x86/vpx_config.c b/config/x86/vpx_config.c
index 77a386493..2d3f0f735 100644
--- a/config/x86/vpx_config.c
+++ b/config/x86/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS. All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86-linux-gcc --disable-sse4_1 --disable-avx --disable-avx2 --as=yasm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=x86-linux-gcc --disable-sse4_1 --disable-avx --disable-avx2 --disable-avx512 --as=yasm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/x86/vpx_config.h b/config/x86/vpx_config.h
index 11a5e94ad..5b0fed08f 100644
--- a/config/x86/vpx_config.h
+++ b/config/x86/vpx_config.h
@@ -29,7 +29,9 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
+#define HAVE_AVX512 0
 #define HAVE_VSX 0
+#define HAVE_MMI 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
 #define CONFIG_TEMPORAL_DENOISING 1
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
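The configure string baked into vpx_config.c is queryable at runtime through vpx_codec_build_config() (shown just above), which is a convenient way to confirm that a deployed libvpx was really built with --enable-vp9-highbitdepth. A small usage sketch:

    #include <stdio.h>
    #include <string.h>
    #include "vpx/vpx_codec.h"

    int main(void) {
      /* Returns the 'cfg' string compiled into vpx_config.c. */
      const char *cfg = vpx_codec_build_config();
      printf("libvpx build config: %s\n", cfg);
      if (strstr(cfg, "--enable-vp9-highbitdepth") != NULL)
        printf("high bit depth support is compiled in\n");
      return 0;
    }
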
diff --git a/config/x86/vpx_dsp_rtcd.h b/config/x86/vpx_dsp_rtcd.h
index adc43df7b..69f2b43b5 100644
--- a/config/x86/vpx_dsp_rtcd.h
+++ b/config/x86/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
 
@@ -13,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 
 #ifdef __cplusplus
@@ -31,42 +33,42 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int
 void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 #define vpx_comp_avg_pred vpx_comp_avg_pred_sse2
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8 vpx_convolve8_ssse3
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg vpx_convolve8_avg_ssse3
 
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_ssse3
 
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_ssse3
 
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_horiz vpx_convolve8_horiz_ssse3
 
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_vert vpx_convolve8_vert_ssse3
 
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_avg vpx_convolve_avg_sse2
 
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_copy vpx_convolve_copy_sse2
 
 void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -294,17 +296,1068 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2
 
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
 #define vpx_hadamard_16x16 vpx_hadamard_16x16_sse2
 
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
 #define vpx_hadamard_8x8 vpx_hadamard_8x8_sse2
 
 void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
 
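The additions that follow are the bit-depth-specific variance/MSE kernels that become reachable once CONFIG_VP9_HIGHBITDEPTH is 1: the 10/12 infix is the operating bit depth, and the uint8_t pointers are aliases of 16-bit sample buffers in the style of libvpx's CONVERT_TO_BYTEPTR convention. A hypothetical dispatch helper, purely illustrative (vpx_highbd_8_variance8x8 is declared elsewhere in this header):

    /* Illustrative only: pick the variance kernel matching the bit depth.
     * src/ref point at uint16_t samples wrapped as uint8_t*, per the
     * high-bit-depth calling convention. */
    static unsigned int highbd_variance8x8(int bd, const uint8_t *src,
                                           int src_stride, const uint8_t *ref,
                                           int ref_stride, unsigned int *sse) {
      switch (bd) {
        case 12:
          return vpx_highbd_12_variance8x8(src, src_stride, ref, ref_stride, sse);
        case 10:
          return vpx_highbd_10_variance8x8(src, src_stride, ref, ref_stride, sse);
        default:
          return vpx_highbd_8_variance8x8(src, src_stride, ref, ref_stride, sse);
      }
    }
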
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_sse2
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_sse2
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_sse2
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_sse2
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_sse2
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_sse2
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_sse2
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_sse2
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_sse2
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_sse2
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_sse2
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_sse2
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_sse2
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int
ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t 
vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int 
yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_sse2 + +uint32_t 
vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_sse2 + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_sse2 + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_sse2 + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_sse2 + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_sse2 + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_sse2 + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_sse2 + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_sse2 + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_sse2 + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_sse2 + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_sse2 + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_sse2 + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define 
vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_sse2 + 
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 
vpx_highbd_8_sub_pixel_variance8x4_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_sse2 + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_sse2 + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_sse2 + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_sse2 + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_sse2 + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_sse2 + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_sse2 + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_sse2 + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_sse2 + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_sse2 + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_sse2 + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_c + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void 
vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_sse2 + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_sse2 + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_ssse3 + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_ssse3 + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_sse2 + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_ssse3 + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_ssse3 + +void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_ssse3 + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_sse2 + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_ssse3 + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t 
y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_ssse3 + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_ssse3 + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_sse2 + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_ssse3 + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_ssse3 + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_ssse3 + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_sse2 + +void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_ssse3 + +void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_ssse3 + +void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_ssse3 + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const 
uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_ssse3 + +void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_ssse3 + +void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_ssse3 + +void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_ssse3 + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_sse2 + +void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_ssse3 + +void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_sse2 + +void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_sse2 + +void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_sse2 + +void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_sse2 + +void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_sse2 + +void vpx_highbd_dc_left_predictor_32x32_c(uint16_t 
*dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_sse2 + +void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_sse2 + +void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_sse2 + +void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_sse2 + +void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_sse2 + +void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_sse2 + +void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_sse2 + +void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_sse2 + +void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_sse2 + +void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_sse2 + +void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, 
const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_sse2 + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_sse2 + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct32x32_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_sse2 + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct32x32_rd_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_sse2 + +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_sse2 + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_sse2 + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c + +void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_sse2 + +void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_sse2 + +void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_sse2 + +void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_sse2 + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2 + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_sse2 + +void 
vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2 + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_sse2 + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_sse2 + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_sse2 + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2 + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_sse2 + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2 + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_sse2 + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2 + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_sse2 + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2 + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define 
vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_sse2 + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_sse2 + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_sse2 + +void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_sse2 + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_sse2 + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_sse2 + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_sse2 + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_sse2 + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_sse2 + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const 
uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_sse2 + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_sse2 + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_sse2 + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_sse2 + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_sse2 + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_sse2 + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_sse2 + +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad16x16x4d_sse2(const uint8_t 
*src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_sse2 + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_sse2 + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_sse2 + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_sse2 + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_sse2 + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_sse2 + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_sse2 + +unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_sse2 + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_sse2 + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_sse2 + +unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_sse2 + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t 
*ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_sse2 + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_sse2 + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_sse2 + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_sse2 + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_sse2 + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_sse2 + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_sse2 + +unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_sse2 + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_sse2 + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_sse2 + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_sse2 + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_sse2 + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_sse2 + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_sse2 + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_sse2 + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_sse2 + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_sse2 + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_sse2 + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_sse2 + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const 
uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_sse2 + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_sse2 + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_sse2 + +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_sse2 + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_sse2 + +void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_sse2 + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_sse2 + +void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_sse2 + +void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_sse2 + +void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_sse2 + +void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, 
const uint16_t *left, int bd); +void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2 + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2 @@ -318,16 +1371,15 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stri #define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2 void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_38_add vpx_idct16x16_256_add_sse2 +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_ssse3 +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_sse2 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_135_add vpx_idct32x32_135_add_ssse3 @@ -359,8 +1411,7 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_64_add vpx_idct8x8_64_add_ssse3 +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2 int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -463,10 +1514,12 @@ void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vpx_quantize_b vpx_quantize_b_sse2 +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_quantize_b vpx_quantize_b_ssse3 void 
vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_ssse3 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -540,16 +1593,10 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_sse2 -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad32x32x4d vpx_sad32x32x4d_sse2 -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c - unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad32x64 vpx_sad32x64_sse2 @@ -593,9 +1640,6 @@ void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c - unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad64x32 vpx_sad64x32_sse2 @@ -616,16 +1660,10 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_sse2 -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); void vpx_sad64x64x4d_sse2(const 
uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad64x64x4d vpx_sad64x64x4d_sse2 -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c - unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x16 vpx_sad8x16_sse2 @@ -657,9 +1695,6 @@ void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c - unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x8 vpx_sad8x8_sse2 @@ -679,27 +1714,27 @@ void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c -int vpx_satd_c(const int16_t *coeff, int length); -int vpx_satd_sse2(const int16_t *coeff, int length); +int vpx_satd_c(const tran_low_t *coeff, int length); +int vpx_satd_sse2(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_sse2 -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_2d vpx_scaled_2d_ssse3 -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, 
ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); diff --git a/config/x86/vpx_scale_rtcd.h b/config/x86/vpx_scale_rtcd.h index ddf7d01cc..5f09104ea 100644 --- a/config/x86/vpx_scale_rtcd.h +++ b/config/x86/vpx_scale_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c diff --git a/config/x86/vpx_version.h b/config/x86/vpx_version.h index 24da169b4..6078bae90 100644 --- a/config/x86/vpx_version.h +++ b/config/x86/vpx_version.h @@ -1,7 +1,8 @@ +// This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 6 -#define VERSION_PATCH 1 +#define VERSION_MINOR 7 +#define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.1" -#define VERSION_STRING " v1.6.1" +#define VERSION_STRING_NOSP "v1.7.0" +#define VERSION_STRING " v1.7.0" diff --git a/config/x86_64/vp8_rtcd.h b/config/x86_64/vp8_rtcd.h index 77479a23b..3afbea668 100644 --- a/config/x86_64/vp8_rtcd.h +++ b/config/x86_64/vp8_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ diff --git a/config/x86_64/vp9_rtcd.h b/config/x86_64/vp9_rtcd.h index f77b2a5c2..2a13e5d5c 100644 --- a/config/x86_64/vp9_rtcd.h +++ b/config/x86_64/vp9_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
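Editorial note: the vpx_dsp_rtcd.h hunks above follow libvpx's RTCD (run-time CPU detection) header convention: each generic entry point is declared once per enabled ISA level, then a #define pins the generic name to the best variant this build enables. Because this configuration passes --disable-runtime-cpu-detect, the binding is purely compile-time, which is why the fix above for vpx_idct16x16_38_add (previously mis-aliased to vpx_idct16x16_256_add_sse2, now bound to its own vpx_idct16x16_38_add_sse2 kernel) is a one-line header rebind with no dispatch-table change. A minimal compilable sketch of the pattern follows; the stub body and main() are illustrative only, not libvpx code.

#include <stdint.h>
#include <stdio.h>

/* tran_low_t is int32_t here because these configs set
 * CONFIG_VP9_HIGHBITDEPTH to 1 (see vpx_dsp/vpx_dsp_common.h). */
typedef int32_t tran_low_t;

/* One declaration per ISA level, exactly as in the generated header... */
void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
/* ...then the generic name is pinned to the best enabled variant. */
#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2

/* Stub standing in for the real SSE2 kernel so the sketch links. */
void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  (void)input; (void)dest; (void)stride;
  puts("dispatched to the sse2 variant");
}

int main(void) {
  tran_low_t coeffs[16 * 16] = { 0 };
  uint8_t recon[16 * 16] = { 0 };
  /* The generic name resolves to the _sse2 symbol at compile time;
   * no function-pointer indirection in a fixed-ISA build. */
  vpx_idct16x16_38_add(coeffs, recon, 16);
  return 0;
}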
#ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -34,15 +35,14 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, in int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_sse2 -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); #define vp9_block_error_fp vp9_block_error_fp_sse2 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_c void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_fdct8x8_quant vp9_fdct8x8_quant_ssse3 @@ -58,14 +58,44 @@ void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_t void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_fht8x8 vp9_fht8x8_sse2 -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -#define vp9_full_search_sad vp9_full_search_sadx3 - void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_sse2 +int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); +int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); +#define vp9_highbd_block_error vp9_highbd_block_error_sse2 + +void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c + +void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c + +void 
vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c + +void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c + +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd); +#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c + +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c + +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c + +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c + +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c + +void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); +#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c + void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 @@ -91,9 +121,6 @@ void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct y void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_ssse3 -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c - void vp9_rtcd(void); #ifdef RTCD_C diff --git a/config/x86_64/vpx_config.asm b/config/x86_64/vpx_config.asm index 798a81a1c..747981e62 100644 --- a/config/x86_64/vpx_config.asm +++ b/config/x86_64/vpx_config.asm @@ -17,7 +17,9 @@ %define HAVE_SSE4_1 0 %define HAVE_AVX 0 %define HAVE_AVX2 0 +%define HAVE_AVX512 0 %define HAVE_VSX 0 +%define HAVE_MMI 0 %define HAVE_VPX_PORTS 1 %define HAVE_PTHREAD_H 1 %define HAVE_UNISTD_H 1 @@ -71,10 +73,11 @@ %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_VP9_TEMPORAL_DENOISING 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -%define CONFIG_VP9_HIGHBITDEPTH 0 +%define CONFIG_VP9_HIGHBITDEPTH 1 %define CONFIG_BETTER_HW_COMPATIBILITY 0 %define CONFIG_EXPERIMENTAL 0 %define CONFIG_SIZE_LIMIT 1 +%define CONFIG_ALWAYS_ADJUST_BPM 0 %define CONFIG_SPATIAL_SVC 0 %define CONFIG_FP_MB_STATS 0 %define CONFIG_EMULATE_HARDWARE 0 diff --git 
a/config/x86_64/vpx_config.c b/config/x86_64/vpx_config.c index 9aa0640aa..a13a1d2e2 100644 --- a/config/x86_64/vpx_config.c +++ b/config/x86_64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86_64-linux-gcc --disable-sse4_1 --disable-avx --disable-avx2 --as=yasm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072"; +static const char* const cfg = "--target=x86_64-linux-gcc --disable-sse4_1 --disable-avx --disable-avx2 --disable-avx512 --as=yasm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/config/x86_64/vpx_config.h b/config/x86_64/vpx_config.h index d24e047ba..75d7e9900 100644 --- a/config/x86_64/vpx_config.h +++ b/config/x86_64/vpx_config.h @@ -29,7 +29,9 @@ #define HAVE_SSE4_1 0 #define HAVE_AVX 0 #define HAVE_AVX2 0 +#define HAVE_AVX512 0 #define HAVE_VSX 0 +#define HAVE_MMI 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 1 #define HAVE_UNISTD_H 1 @@ -83,10 +85,11 @@ #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 #define CONFIG_SPATIAL_SVC 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 diff --git a/config/x86_64/vpx_dsp_rtcd.h b/config/x86_64/vpx_dsp_rtcd.h index 4e55439ca..a382a5a05 100644 --- a/config/x86_64/vpx_dsp_rtcd.h +++ b/config/x86_64/vpx_dsp_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
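Editorial note: the vpx_config changes above (CONFIG_VP9_HIGHBITDEPTH flipped from 0 to 1, plus the new --enable-vp9-highbitdepth configure flag) are what pull the large vpx_highbd_* and vpx_highbd_10_* blocks into the generated x86_64 headers. Separately, both the vpx_scaled_* hunks earlier and the vpx_convolve8 hunks that follow migrate the convolution API from two raw int16_t filter-phase pointers to the whole InterpKernel table from vpx_dsp/vpx_filter.h (newly included by vpx_dsp_rtcd.h), with explicit starting phases x0_q4/y0_q4. A sketch of the new calling convention, under the assumption that InterpKernel is an 8-tap int16_t kernel type as declared in vpx_dsp/vpx_filter.h; the stub body and kernel values are illustrative, not libvpx's real filter coefficients.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define SUBPEL_TAPS 8
typedef int16_t InterpKernel[SUBPEL_TAPS]; /* as in vpx_dsp/vpx_filter.h */

/* Stub with the v1.7.0 signature from the '+' lines of the diff; the real
 * function does 8-tap subpixel filtering, this stub only demonstrates the
 * parameter plumbing and degenerates to a row copy. */
static void convolve8_stub(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4,
                           int x_step_q4, int y0_q4, int y_step_q4,
                           int w, int h) {
  (void)filter; (void)x0_q4; (void)x_step_q4; (void)y0_q4; (void)y_step_q4;
  for (int r = 0; r < h; ++r)
    memcpy(dst + r * dst_stride, src + r * src_stride, (size_t)w);
}

int main(void) {
  /* A 16-phase kernel table; only phase 0 initialized, rest zeroed. */
  static const InterpKernel filters[16] = { { 0, 0, 0, 128, 0, 0, 0, 0 } };
  uint8_t src[64 * 64] = { 0 }, dst[64 * 64];
  /* The old API passed filter_x/filter_y pointers and two steps; the new
   * API passes the kernel table once plus starting phases. Unit step in
   * Q4 fixed point is 16, so (x0_q4=0, x_step_q4=16) means "no scaling". */
  convolve8_stub(src, 64, dst, 64, filters, 0, 16, 0, 16, 32, 32);
  return 0;
}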
#ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #ifdef __cplusplus @@ -31,42 +33,42 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); #define vpx_comp_avg_pred vpx_comp_avg_pred_sse2 -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8 vpx_convolve8_ssse3 -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg vpx_convolve8_avg_ssse3 -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, 
int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_ssse3

-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_ssse3

-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_ssse3

-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_ssse3

-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_sse2

-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_sse2

void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -257,8 +259,7 @@ void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct8x8_ssse3(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct8x8 vpx_fdct8x8_ssse3
+#define vpx_fdct8x8 vpx_fdct8x8_sse2

void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride);
@@ -295,18 +296,1075 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2

-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_16x16 vpx_hadamard_16x16_sse2

-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_8x8 vpx_hadamard_8x8_ssse3

void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c

+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_sse2
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_sse2
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_sse2
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_sse2
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_sse2
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_sse2
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_sse2
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_sse2
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_sse2
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_sse2
+
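The pattern throughout this generated section is uniform: each block size gets a plain-C reference kernel plus one or more SIMD variants, and a #define binds the fastest variant this x86 configuration enables, so callers always invoke the alias. As a minimal sketch of what the variance entries above compute (an illustrative model, not code from the header; the vpx_highbd_* variants run the same arithmetic on 16-bit samples behind libvpx's uint8_t-pointer convention for high bit depth):

static unsigned int variance_model(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h, unsigned int *sse) {
  int64_t sum = 0;  /* sum of pixel differences */
  uint64_t sq = 0;  /* sum of squared differences */
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int d = src[j] - ref[j];
      sum += d;
      sq += (uint64_t)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (unsigned int)sq;  /* SSE is also returned through the out-param */
  /* variance = SSE - sum^2 / N; the subtrahend can never exceed SSE */
  return (unsigned int)(sq - (uint64_t)((sum * sum) / (w * h)));
}

The *_sse2 prototypes are drop-in replacements for exactly this computation, which is why switching implementations only ever touches the #define alias.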
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_sse2
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_sse2
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_sse2
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t
*src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define 
vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_sse2 + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, 
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_sse2 + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_sse2 + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_sse2 + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_sse2 + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_sse2 + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_sse2 + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_sse2 + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_sse2 + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_sse2 + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_sse2 + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_sse2 + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t 
vpx_highbd_8_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_sse2 + +uint32_t 
vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_sse2 + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_sse2 + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_sse2 + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_sse2 + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_sse2 + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_sse2 + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_sse2 + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_sse2 + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_sse2 + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_sse2 + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_sse2 + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_sse2 + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_sse2 + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_sse2 + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_sse2 + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_sse2 + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_vert_sse2(const uint16_t *src, ptrdiff_t 
src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_sse2 + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_sse2 + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_sse2 + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_ssse3 + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_ssse3 + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_sse2 + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_ssse3 + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_ssse3 + +void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_ssse3 + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define 
vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_sse2 + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_ssse3 + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_ssse3 + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_ssse3 + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_sse2 + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_ssse3 + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_ssse3 + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_ssse3 + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_sse2 + +void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_ssse3 + +void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_ssse3 + +void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const 
uint16_t *left, int bd); +void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_ssse3 + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_ssse3 + +void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_ssse3 + +void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_ssse3 + +void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_ssse3 + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_sse2 + +void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_ssse3 + +void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_sse2 + +void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_sse2 + +void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_sse2 + +void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define 
vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_sse2 + +void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_sse2 + +void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_sse2 + +void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_sse2 + +void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_sse2 + +void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_sse2 + +void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_sse2 + +void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_sse2 + +void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_sse2 + +void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_sse2 + +void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_sse2 + +void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t 
y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_sse2 + +void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_sse2 + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_sse2 + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct32x32_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_sse2 + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct32x32_rd_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_sse2 + +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_sse2 + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_sse2 + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c + +void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_sse2 + +void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_sse2 + +void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_sse2 + +void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_sse2 + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, 
uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2 + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_sse2 + +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2 + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_sse2 + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_sse2 + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_sse2 + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2 + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_sse2 + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2 + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_sse2 + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2 + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_sse2 + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2 + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_16_add 
vpx_highbd_iwht4x4_16_add_c + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_sse2 + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_sse2 + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_sse2 + +void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_sse2 + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_sse2 + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_sse2 + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_sse2 + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_sse2 + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, 
const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_sse2 + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_sse2 + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_sse2 + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_sse2 + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_sse2 + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_sse2 + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_sse2 + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_sse2 + +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_sse2 + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_sse2 + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_sse2 + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_sse2 + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_sse2 + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_sse2 + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_sse2 + +unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_sse2 + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_sse2 + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int 
ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_sse2 + +unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_sse2 + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_sse2 + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_sse2 + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_sse2 + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_sse2 + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_sse2 + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_sse2 + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_sse2 + +unsigned int 
vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_sse2 + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_sse2 + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_sse2 + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_sse2 + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_sse2 + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_sse2 + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_sse2 + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_sse2 + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_sse2 + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_sse2 + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const 
uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_sse2 + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_sse2 + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_sse2 + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_sse2 + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_sse2 + +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_sse2 + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_sse2 + +void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_sse2 + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_sse2 + +void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_sse2 + +void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t 
*left, int bd); +#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_sse2 + +void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_sse2 + +void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2 + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2 @@ -320,16 +1378,15 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stri #define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2 void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_38_add vpx_idct16x16_256_add_sse2 +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_ssse3 +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_sse2 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_135_add vpx_idct32x32_135_add_ssse3 @@ -361,8 +1418,7 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_64_add vpx_idct8x8_64_add_ssse3 +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2 int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -544,16 +1600,10 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_sse2 -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, 
uint32_t *sad_array); #define vpx_sad32x32x4d vpx_sad32x32x4d_sse2 -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c - unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad32x64 vpx_sad32x64_sse2 @@ -597,9 +1647,6 @@ void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c - unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad64x32 vpx_sad64x32_sse2 @@ -620,16 +1667,10 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_sse2 -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad64x64x4d vpx_sad64x64x4d_sse2 -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c - unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x16 vpx_sad8x16_sse2 @@ -661,9 +1702,6 @@ void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c - unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x8 vpx_sad8x8_sse2 @@ -683,27 +1721,27 @@ void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c -int vpx_satd_c(const int16_t *coeff, int length); -int vpx_satd_sse2(const int16_t *coeff, int length); +int vpx_satd_c(const tran_low_t *coeff, int length); +int vpx_satd_sse2(const tran_low_t *coeff, 
int length); #define vpx_satd vpx_satd_sse2 -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_2d vpx_scaled_2d_ssse3 -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); diff --git a/config/x86_64/vpx_scale_rtcd.h b/config/x86_64/vpx_scale_rtcd.h index ddf7d01cc..5f09104ea 100644 --- a/config/x86_64/vpx_scale_rtcd.h +++ b/config/x86_64/vpx_scale_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
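The vpx_dsp_rtcd.h listing above is a generated dispatch table: each kernel gets a plain-C prototype, prototypes for whichever SIMD variants the configuration enables, and a #define that binds the generic name to the chosen variant at compile time (these Android configs pass --disable-runtime-cpu-detect, so the binding is static rather than resolved at startup). A minimal sketch of what that means for a caller, using the vpx_satd entry from the table above; the wrapper function is hypothetical, and the include assumes the generated config directory is on the include path:

    /* sketch: calling through the generated RTCD macro */
    #include "vpx_dsp_rtcd.h" /* the generated header shown above */

    /* hypothetical helper: in this x86_64 configuration, vpx_satd is a
       macro that expands to vpx_satd_sse2(coeff, length) */
    static int block_satd(const tran_low_t *coeff, int length) {
      return vpx_satd(coeff, length);
    }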
#ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c diff --git a/config/x86_64/vpx_version.h b/config/x86_64/vpx_version.h index 24da169b4..6078bae90 100644 --- a/config/x86_64/vpx_version.h +++ b/config/x86_64/vpx_version.h @@ -1,7 +1,8 @@ +// This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 6 -#define VERSION_PATCH 1 +#define VERSION_MINOR 7 +#define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.1" -#define VERSION_STRING " v1.6.1" +#define VERSION_STRING_NOSP "v1.7.0" +#define VERSION_STRING " v1.7.0" diff --git a/generate_config.sh b/generate_config.sh index 97e5a0026..351feb87c 100755 --- a/generate_config.sh +++ b/generate_config.sh @@ -139,23 +139,76 @@ function gen_config_files { rm -rf vpx_config.* vpx_version.h } +# Generate a text file containing sources for a config +# $1 - Config +function gen_source_list { + make_clean + if [[ "$1" = "mips"* ]] || [[ "$1" = "generic" ]]; then + config=$(print_config_basic $1) + else + config=$(print_config $1) + fi + make libvpx_srcs.txt target=libs $config > /dev/null + mv libvpx_srcs.txt libvpx_srcs_$1.txt +} + +# Extract a list of C sources from a libvpx_srcs.txt file +# $1 - path to libvpx_srcs.txt +function libvpx_srcs_txt_to_c_srcs { + grep ".c$" $1 | grep -v "^vpx_config.c$" | awk '$0="\"libvpx/"$0"\","' | sort +} + +# Extract a list of ASM sources from a libvpx_srcs.txt file +# $1 - path to libvpx_srcs.txt +function libvpx_srcs_txt_to_asm_srcs { + grep ".asm$" $1 | awk '$0="\"libvpx/"$0"\","' | sort +} + # Convert a list of sources to a blueprint file containing a variable # assignment. -# $1 - Variable name prefix. -# $2 - Input file. -# $3 - Config directory. +# $1 - Config function gen_bp_srcs { - echo "$1_c_srcs = [" - grep ".c$" $2 | grep -v "^vpx_config.c$" | awk '$0="\"libvpx/"$0"\","' - echo "\"$3/vpx_config.c\"," - echo "]" - if grep -q ".asm$" $2; then + ( + varprefix=libvpx_${1//-/_} + echo "${varprefix}_c_srcs = [" + libvpx_srcs_txt_to_c_srcs libvpx_srcs_$1.txt + echo "\"$LIBVPX_CONFIG_DIR/$1/vpx_config.c\"," + echo "]" + if grep -q ".asm$" libvpx_srcs_$1.txt; then + echo + echo "${varprefix}_asm_srcs = [" + libvpx_srcs_txt_to_asm_srcs libvpx_srcs_$1.txt + echo "]" + fi echo - echo "$1_asm_srcs = [" - grep ".asm$" $2 | awk '$0="\"libvpx/"$0"\","' + ) > config_$1.bp +} + +# Convert a list of sources to a blueprint file containing a variable +# assignment, relative to a reference config. 
+# $1 - Config +# $2 - Reference config +function gen_bp_srcs_with_excludes { + ( + varprefix=libvpx_${1//-/_} + echo "${varprefix}_c_srcs = [" + comm -23 <(libvpx_srcs_txt_to_c_srcs libvpx_srcs_$1.txt) <(libvpx_srcs_txt_to_c_srcs libvpx_srcs_$2.txt) + echo "\"$LIBVPX_CONFIG_DIR/$1/vpx_config.c\"," echo "]" - fi - echo + echo + echo "${varprefix}_exclude_c_srcs = [" + comm -13 <(libvpx_srcs_txt_to_c_srcs libvpx_srcs_$1.txt) <(libvpx_srcs_txt_to_c_srcs libvpx_srcs_$2.txt) + echo "\"$LIBVPX_CONFIG_DIR/$2/vpx_config.c\"," + echo "]" + echo + if grep -q ".asm$" libvpx_srcs_$1.txt; then + echo + echo "${varprefix}_asm_srcs = [" + libvpx_srcs_txt_to_asm_srcs libvpx_srcs_$1.txt + echo "]" + fi + echo + ) > config_$1.bp } echo "Create temporary directory." @@ -165,8 +218,10 @@ cp -R $LIBVPX_SRC_DIR $TEMP_DIR cd $TEMP_DIR echo "Generate config files." -all_platforms="--enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072" -intel="--disable-sse4_1 --disable-avx --disable-avx2 --as=yasm" +all_platforms="--enable-external-build --enable-realtime-only --enable-pic" +all_platforms+=" --disable-runtime-cpu-detect --disable-install-docs" +all_platforms+=" --size-limit=4096x3072 --enable-vp9-highbitdepth" +intel="--disable-sse4_1 --disable-avx --disable-avx2 --disable-avx512 --as=yasm" gen_config_files x86 "--target=x86-linux-gcc ${intel} ${all_platforms}" gen_config_files x86_64 "--target=x86_64-linux-gcc ${intel} ${all_platforms}" gen_config_files arm "--target=armv7-linux-gcc --disable-neon ${all_platforms}" @@ -218,71 +273,31 @@ echo "Prepare Makefile." ./configure --target=generic-gnu > /dev/null make_clean -echo "Generate X86 source list." -config=$(print_config x86) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_x86 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/x86 > config_x86.bp - -echo "Generate X86_64 source list." -config=$(print_config x86_64) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_x86_64 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/x86_64 > config_x86_64.bp - -echo "Generate ARM source list." -config=$(print_config arm) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_arm libvpx_srcs.txt $LIBVPX_CONFIG_DIR/arm > config_arm.bp - -echo "Generate ARM NEON source list." -config=$(print_config arm-neon) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_arm_neon libvpx_srcs.txt $LIBVPX_CONFIG_DIR/arm-neon > config_arm-neon.bp - -echo "Generate ARM64 source list." -config=$(print_config arm64) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_arm64 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/arm64 > config_arm64.bp - -echo "Generate MIPS source list." -config=$(print_config_basic mips32) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_mips32 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips32 > config_mips32.bp - -echo "Generate MIPS DSPR2 source list." -config=$(print_config_basic mips32-dspr2) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_mips32_dspr2 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips32-dspr2 > config_mips32-dispr2.bp - -echo "Generate MIPS MSA source list." 
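The gen_bp_srcs_with_excludes function added above builds the per-variant lists with comm(1) over two sorted source lists: comm -23 keeps entries unique to the first file (sources only the variant builds), while comm -13 keeps entries unique to the second (reference sources the variant must exclude). A standalone sketch of the same split, with hypothetical file names; comm needs sorted input, which the helper functions above guarantee by piping through sort:

    # sketch: the comm(1) split used by gen_bp_srcs_with_excludes
    sort -o variant_srcs.txt variant_srcs.txt
    sort -o reference_srcs.txt reference_srcs.txt
    comm -23 variant_srcs.txt reference_srcs.txt   # only in variant   -> extra sources
    comm -13 variant_srcs.txt reference_srcs.txt   # only in reference -> excluded sources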
-config=$(print_config_basic mips32-msa) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_mips32_msa libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips32-msa > config_mips32-msa.bp - -echo "Generate MIPS64 source list." -config=$(print_config_basic mips64) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_mips64 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips64 > config_mips64.bp - -echo "Generate MIPS64 MSA source list." -config=$(print_config_basic mips64-msa) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_mips64_msa libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips64-msa > config_mips64-msa.bp - -echo "Generate GENERIC source list." -config=$(print_config_basic generic) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_generic libvpx_srcs.txt $LIBVPX_CONFIG_DIR/generic > config_generic.bp +echo "Generate source lists" +gen_source_list x86 +gen_source_list x86_64 +gen_source_list arm +gen_source_list arm-neon +gen_source_list arm64 +gen_source_list mips32 +gen_source_list mips32-dspr2 +gen_source_list mips32-msa +gen_source_list mips64 +gen_source_list mips64-msa +gen_source_list generic + +echo "Convert to bp" +gen_bp_srcs x86 +gen_bp_srcs x86_64 +gen_bp_srcs arm +gen_bp_srcs_with_excludes arm-neon arm +gen_bp_srcs arm64 +gen_bp_srcs mips32 +gen_bp_srcs_with_excludes mips32-dspr2 mips32 +gen_bp_srcs_with_excludes mips32-msa mips32 +gen_bp_srcs mips64 +gen_bp_srcs_with_excludes mips64-msa mips64 +gen_bp_srcs generic rm -f $BASE_DIR/Android.bp ( diff --git a/libvpx/.clang-format b/libvpx/.clang-format index 7837b7704..c1483199e 100644 --- a/libvpx/.clang-format +++ b/libvpx/.clang-format @@ -1,7 +1,7 @@ --- Language: Cpp # BasedOnStyle: Google -# Generated with clang-format 3.9.1 +# Generated with clang-format 4.0.1 AccessModifierOffset: -1 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: false @@ -60,6 +60,8 @@ IncludeIsMainRegex: '([-_](test|unittest))?$' IndentCaseLabels: true IndentWidth: 2 IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' @@ -78,6 +80,7 @@ PointerAlignment: Right ReflowComments: true SortIncludes: false SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true SpaceBeforeParens: ControlStatements SpaceInEmptyParentheses: false diff --git a/libvpx/.mailmap b/libvpx/.mailmap index 166c45ee8..29af51065 100644 --- a/libvpx/.mailmap +++ b/libvpx/.mailmap @@ -3,6 +3,7 @@ Aℓex Converse <aconverse@google.com> Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com> Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com> Alpha Lam <hclam@google.com> <hclam@chromium.org> +Chris Cunningham <chcunningham@chromium.org> Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com> Deb Mukherjee <debargha@google.com> Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com> @@ -21,18 +22,21 @@ Marco Paniconi <marpan@google.com> Marco Paniconi <marpan@google.com> <marpan@chromium.org> Pascal Massimino <pascal.massimino@gmail.com> Paul Wilkins <paulwilkins@google.com> +Peter Boström <pbos@chromium.org> <pbos@google.com> Peter de Rivaz <peter.derivaz@gmail.com> Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com> Ralph Giles <giles@xiph.org> <giles@entropywave.com> Ralph Giles <giles@xiph.org> <giles@mozilla.com> Ronald S. 
Bultje <rsbultje@gmail.com> <rbultje@google.com> Sami Pietilä <samipietila@google.com> +Shiyou Yin <yinshiyou-hf@loongson.cn> Tamar Levy <tamar.levy@intel.com> Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com> Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com> Timothy B. Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com> Tom Finegan <tomfinegan@google.com> Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org> +Urvang Joshi <urvang@google.com> <urvang@chromium.org> Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com> Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com> Yaowu Xu <yaowu@google.com> <Yaowu Xu> diff --git a/libvpx/AUTHORS b/libvpx/AUTHORS index 87a5e845c..04c287243 100644 --- a/libvpx/AUTHORS +++ b/libvpx/AUTHORS @@ -3,13 +3,13 @@ Aaron Watry <awatry@gmail.com> Abo Talib Mahfoodh <ab.mahfoodh@gmail.com> -Adam Xu <adam@xuyaowu.com> Adrian Grange <agrange@google.com> Aℓex Converse <aconverse@google.com> Ahmad Sharif <asharif@google.com> Aleksey Vasenev <margtu-fivt@ya.ru> Alexander Potapenko <glider@google.com> Alexander Voronov <avoronov@graphics.cs.msu.ru> +Alexandra Hájková <alexandra.khirnova@gmail.com> Alexis Ballier <aballier@gentoo.org> Alok Ahuja <waveletcoeff@gmail.com> Alpha Lam <hclam@google.com> @@ -17,6 +17,7 @@ A.Mahfoodh <ab.mahfoodh@gmail.com> Ami Fischman <fischman@chromium.org> Andoni Morales Alastruey <ylatuya@gmail.com> Andres Mejia <mcitadel@gmail.com> +Andrew Lewis <andrewlewis@google.com> Andrew Russell <anrussell@google.com> Angie Chiang <angiebird@google.com> Aron Rosenberg <arosenberg@logitech.com> @@ -24,7 +25,9 @@ Attila Nagy <attilanagy@google.com> Brion Vibber <bvibber@wikimedia.org> changjun.yang <changjun.yang@intel.com> Charles 'Buck' Krasic <ckrasic@google.com> +Cheng Chen <chengchen@google.com> chm <chm@rock-chips.com> +Chris Cunningham <chcunningham@chromium.org> Christian Duvivier <cduvivier@google.com> Daniele Castagna <dcastagna@chromium.org> Daniel Kang <ddkang@google.com> @@ -46,10 +49,12 @@ Geza Lore <gezalore@gmail.com> Ghislain MARY <ghislainmary2@gmail.com> Giuseppe Scrivano <gscrivano@gnu.org> Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com> +Gregor Jasny <gjasny@gmail.com> Guillaume Martres <gmartres@google.com> Guillermo Ballester Valor <gbvalor@gmail.com> Hangyu Kuang <hkuang@google.com> Hanno Böck <hanno@hboeck.de> +Han Shen <shenhan@google.com> Henrik Lundin <hlundin@google.com> Hui Su <huisu@google.com> Ivan Krasin <krasin@chromium.org> @@ -83,6 +88,7 @@ Justin Clift <justin@salasaga.org> Justin Lebar <justin.lebar@gmail.com> Kaustubh Raste <kaustubh.raste@imgtec.com> KO Myung-Hun <komh@chollian.net> +Kyle Siefring <kylesiefring@gmail.com> Lawrence Velázquez <larryv@macports.org> Linfeng Zhang <linfengz@google.com> Lou Quillio <louquillio@google.com> @@ -101,6 +107,7 @@ Mikhal Shemer <mikhal@google.com> Min Chen <chenm003@gmail.com> Minghai Shang <minghai@google.com> Min Ye <yeemmi@google.com> +Moriyoshi Koizumi <mozo@mozo.jp> Morton Jonuschat <yabawock@gmail.com> Nathan E. 
Egge <negge@mozilla.com>
 Nico Weber <thakis@chromium.org>
@@ -111,12 +118,15 @@ Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Paweł Hajdan <phajdan@google.com>
 Pengchong Jin <pengchong@google.com>
-Peter Boström <pbos@google.com>
+Peter Boström <pbos@chromium.org>
+Peter Collingbourne <pcc@chromium.org>
 Peter de Rivaz <peter.derivaz@gmail.com>
 Philip Jägenstedt <philipj@opera.com>
 Priit Laes <plaes@plaes.org>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
 Rafaël Carré <funman@videolan.org>
+Rafael de Lucena Valle <rafaeldelucena@gmail.com>
+Rahul Chaudhry <rahulchaudhry@google.com>
 Ralph Giles <giles@xiph.org>
 Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
 Rob Bradford <rob@linux.intel.com>
@@ -131,9 +141,11 @@ Sean McGovern <gseanmcg@gmail.com>
 Sergey Kolomenkin <kolomenkin@gmail.com>
 Sergey Ulanov <sergeyu@chromium.org>
 Shimon Doodkin <helpmepro1@gmail.com>
+Shiyou Yin <yinshiyou-hf@loongson.cn>
 Shunyao Li <shunyaoli@google.com>
 Stefan Holmer <holmer@google.com>
 Suman Sunkara <sunkaras@google.com>
+Sylvestre Ledru <sylvestre@mozilla.com>
 Taekhyun Kim <takim@nvidia.com>
 Takanori MATSUURA <t.matsuu@gmail.com>
 Tamar Levy <tamar.levy@intel.com>
@@ -146,6 +158,7 @@ Tom Finegan <tomfinegan@google.com>
 Tristan Matthews <le.businessman@gmail.com>
 Urvang Joshi <urvang@google.com>
 Vignesh Venkatasubramanian <vigneshv@google.com>
+Vlad Tsyrklevich <vtsyrklevich@chromium.org>
 Yaowu Xu <yaowu@google.com>
 Yi Luo <luoyi@google.com>
 Yongzhe Wang <yongzhe@google.com>
diff --git a/libvpx/CHANGELOG b/libvpx/CHANGELOG
index 7e7aec67a..2281394c8 100644
--- a/libvpx/CHANGELOG
+++ b/libvpx/CHANGELOG
@@ -1,3 +1,28 @@
+2018-01-24 v1.7.0 "Mandarin Duck"
+  This release focused on high bit depth performance (10/12 bit) and vp9
+  encoding improvements.
+
+  - Upgrading:
+    This release is ABI incompatible due to new vp9 encoder features.
+
+    Frame parallel decoding for vp9 has been removed.
+
+  - Enhancements:
+    vp9 encoding supports additional threads with --row-mt. This can be greater
+    than the number of tiles.
+
+    Two new vp9 encoder options have been added:
+      --corpus-complexity
+      --tune-content=film
+
+    Additional tooling for respecting the vp9 "level" profiles has been added.
+
+  - Bug fixes:
+    A variety of fuzzing issues.
+    vp8 threading fix for ARM.
+    Codec control VP9_SET_SKIP_LOOP_FILTER fixed.
+    Reject invalid multi resolution configurations.
+
 2017-01-09 v1.6.1 "Long Tailed Duck"
   This release improves upon the VP9 encoder and speeds up the encoding and
   decoding processes.
diff --git a/libvpx/README b/libvpx/README
index f910ce761..73304dd62 100644
--- a/libvpx/README
+++ b/libvpx/README
@@ -1,4 +1,4 @@
-README - 26 January 2017
+README - 24 January 2018

 Welcome to the WebM VP8/VP9 Codec SDK!
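The --row-mt option noted in the CHANGELOG above is exposed through the VP9E_SET_ROW_MT codec control, which this same patch starts using in vp9_spatial_svc_encoder.c. A minimal sketch, assuming an encoder context already set up with vpx_codec_enc_init():

    #include "vpx/vp8cx.h"
    #include "vpx/vpx_encoder.h"

    // Enable row-based multi-threading on a configured vp9 encoder. With
    // row-mt the worker count may usefully exceed the number of tile columns.
    static void enable_row_mt(vpx_codec_ctx_t *codec) {
      vpx_codec_control(codec, VP9E_SET_ROW_MT, 1);
    }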
@@ -63,6 +63,8 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv8-linux-gcc mips32-linux-gcc mips64-linux-gcc + ppc64-linux-gcc + ppc64le-linux-gcc sparc-solaris-gcc x86-android-gcc x86-darwin8-gcc diff --git a/libvpx/build/make/Makefile b/libvpx/build/make/Makefile index 90522e5f6..f6b3f0630 100644 --- a/libvpx/build/make/Makefile +++ b/libvpx/build/make/Makefile @@ -139,6 +139,8 @@ $(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 +$(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl +$(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl # POWER $(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx diff --git a/libvpx/build/make/configure.sh b/libvpx/build/make/configure.sh index fbe8b1b45..683b43037 100755 --- a/libvpx/build/make/configure.sh +++ b/libvpx/build/make/configure.sh @@ -403,6 +403,23 @@ check_gcc_machine_option() { fi } +# tests for -m$2, -m$3, -m$4... toggling the feature given in $1. +check_gcc_machine_options() { + feature="$1" + shift + flags="-m$1" + shift + for opt in $*; do + flags="$flags -m$opt" + done + + if enabled gcc && ! disabled "$feature" && ! check_cflags $flags; then + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature " + else + soft_enable "$feature" + fi +} + write_common_config_banner() { print_webm_license config.mk "##" "" echo '# This file automatically generated by configure. Do not edit!' >> config.mk @@ -702,6 +719,12 @@ process_common_toolchain() { power*) tgt_isa=ppc ;; + *mips64el*) + tgt_isa=mips64 + ;; + *mips32el*) + tgt_isa=mips32 + ;; esac # detect tgt_os @@ -1163,6 +1186,11 @@ EOF fi fi + if enabled mmi; then + tgt_isa=loongson3a + check_add_ldflags -march=loongson3a + fi + check_add_cflags -march=${tgt_isa} check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC @@ -1227,6 +1255,13 @@ EOF msvs_arch_dir=x86-msvs vc_version=${tgt_cc##vs} case $vc_version in + 7|8|9|10|11|12|13|14) + echo "${tgt_cc} does not support avx512, disabling....." + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 " + soft_disable avx512 + ;; + esac + case $vc_version in 7|8|9|10) echo "${tgt_cc} does not support avx/avx2, disabling....." RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 " @@ -1270,9 +1305,18 @@ EOF elif disabled $ext; then disable_exts="yes" else - # use the shortened version for the flag: sse4_1 -> sse4 - check_gcc_machine_option ${ext%_*} $ext + if [ "$ext" = "avx512" ]; then + check_gcc_machine_options $ext avx512f avx512cd avx512bw avx512dq avx512vl + else + # use the shortened version for the flag: sse4_1 -> sse4 + check_gcc_machine_option ${ext%_*} $ext + fi fi + + # https://bugs.chromium.org/p/webm/issues/detail?id=1464 + # The assembly optimizations for vpx_sub_pixel_variance do not link with + # gcc 6. 
+ enabled sse2 && soft_enable pic done if enabled external_build; then @@ -1297,7 +1341,6 @@ EOF esac log_echo " using $AS" fi - [ "${AS##*/}" = nasm ] && add_asflags -Ox AS_SFX=.asm case ${tgt_os} in win32) @@ -1306,7 +1349,7 @@ EOF EXE_SFX=.exe ;; win64) - add_asflags -f x64 + add_asflags -f win64 enabled debug && add_asflags -g cv8 EXE_SFX=.exe ;; @@ -1440,6 +1483,10 @@ EOF echo "msa optimizations are available only for little endian platforms" disable_feature msa fi + if enabled mmi; then + echo "mmi optimizations are available only for little endian platforms" + disable_feature mmi + fi fi ;; esac diff --git a/libvpx/build/make/gen_msvs_sln.sh b/libvpx/build/make/gen_msvs_sln.sh index 8b68038b3..401223a0b 100755 --- a/libvpx/build/make/gen_msvs_sln.sh +++ b/libvpx/build/make/gen_msvs_sln.sh @@ -240,10 +240,10 @@ case "${vs_ver:-10}" in 12) sln_vers="12.00" sln_vers_str="Visual Studio 2013" ;; - 14) sln_vers="14.00" + 14) sln_vers="12.00" sln_vers_str="Visual Studio 2015" ;; - 15) sln_vers="15.00" + 15) sln_vers="12.00" sln_vers_str="Visual Studio 2017" ;; esac diff --git a/libvpx/build/make/rtcd.pl b/libvpx/build/make/rtcd.pl index ce88e6480..68e92b52c 100755 --- a/libvpx/build/make/rtcd.pl +++ b/libvpx/build/make/rtcd.pl @@ -1,4 +1,13 @@ #!/usr/bin/env perl +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## no strict 'refs'; use warnings; @@ -200,6 +209,7 @@ sub filter { sub common_top() { my $include_guard = uc($opts{sym})."_H_"; print <<EOF; +// This file is generated. Do not edit. #ifndef ${include_guard} #define ${include_guard} @@ -391,10 +401,10 @@ EOF &require("c"); if ($opts{arch} eq 'x86') { - @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/); + @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/); x86; } elsif ($opts{arch} eq 'x86_64') { - @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/); + @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/); @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/); &require(@REQUIRES); x86; @@ -411,6 +421,10 @@ if ($opts{arch} eq 'x86') { @ALL_ARCHS = filter("$opts{arch}", qw/msa/); last; } + if (/HAVE_MMI=yes/) { + @ALL_ARCHS = filter("$opts{arch}", qw/mmi/); + last; + } } close CONFIG_FILE; mips; diff --git a/libvpx/build/make/version.sh b/libvpx/build/make/version.sh index 696752777..f36ede10f 100755 --- a/libvpx/build/make/version.sh +++ b/libvpx/build/make/version.sh @@ -60,6 +60,7 @@ if [ ${bare} ]; then echo "${changelog_version}${git_version_id}" > $$.tmp else cat<<EOF>$$.tmp +// This file is generated. Do not edit. #define VERSION_MAJOR $major_version #define VERSION_MINOR $minor_version #define VERSION_PATCH $patch_version diff --git a/libvpx/configure b/libvpx/configure index 090d3fb1e..e5a74c6f2 100755 --- a/libvpx/configure +++ b/libvpx/configure @@ -170,11 +170,14 @@ for t in ${all_targets}; do [ -f "${source_path}/${t}.mk" ] && enable_feature ${t} done +if ! diff --version >/dev/null; then + die "diff missing: Try installing diffutils via your package manager." +fi + if ! 
perl --version >/dev/null; then die "Perl is required to build" fi - if [ "`cd \"${source_path}\" && pwd`" != "`pwd`" ]; then # test to see if source_path already configured if [ -f "${source_path}/vpx_config.h" ]; then @@ -241,7 +244,13 @@ ARCH_EXT_LIST_X86=" sse4_1 avx avx2 + avx512 " + +ARCH_EXT_LIST_LOONGSON=" + mmi +" + ARCH_EXT_LIST=" neon neon_asm @@ -254,6 +263,8 @@ ARCH_EXT_LIST=" ${ARCH_EXT_LIST_X86} vsx + + ${ARCH_EXT_LIST_LOONGSON} " HAVE_LIST=" ${ARCH_EXT_LIST} @@ -319,6 +330,7 @@ CONFIG_LIST=" better_hw_compatibility experimental size_limit + always_adjust_bpm ${EXPERIMENT_LIST} " CMDLINE_SELECT=" @@ -378,6 +390,7 @@ CMDLINE_SELECT=" better_hw_compatibility vp9_highbitdepth experimental + always_adjust_bpm " process_cmdline() { @@ -579,6 +592,7 @@ process_toolchain() { check_add_cflags -Wdeclaration-after-statement check_add_cflags -Wdisabled-optimization check_add_cflags -Wfloat-conversion + check_add_cflags -Wparentheses-equality check_add_cflags -Wpointer-arith check_add_cflags -Wtype-limits check_add_cflags -Wcast-qual @@ -651,7 +665,7 @@ process_toolchain() { gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror" all_targets="${all_targets} solution" - INLINE="__forceinline" + INLINE="__inline" ;; esac diff --git a/libvpx/examples/vp8_multi_resolution_encoder.c b/libvpx/examples/vp8_multi_resolution_encoder.c index 0b9663c77..b14b1ff39 100644 --- a/libvpx/examples/vp8_multi_resolution_encoder.c +++ b/libvpx/examples/vp8_multi_resolution_encoder.c @@ -151,7 +151,7 @@ static void write_ivf_frame_header(FILE *outfile, if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; pts = pkt->data.frame.pts; - mem_put_le32(header, pkt->data.frame.sz); + mem_put_le32(header, (int)pkt->data.frame.sz); mem_put_le32(header + 4, pts & 0xFFFFFFFF); mem_put_le32(header + 8, pts >> 32); @@ -190,7 +190,7 @@ static void set_temporal_layer_pattern(int num_temporal_layers, cfg->ts_layer_id[0] = 0; cfg->ts_layer_id[1] = 1; // Use 60/40 bit allocation as example. - cfg->ts_target_bitrate[0] = 0.6f * bitrate; + cfg->ts_target_bitrate[0] = (int)(0.6f * bitrate); cfg->ts_target_bitrate[1] = bitrate; /* 0=L, 1=GF */ @@ -241,8 +241,8 @@ static void set_temporal_layer_pattern(int num_temporal_layers, cfg->ts_layer_id[2] = 1; cfg->ts_layer_id[3] = 2; // Use 45/20/35 bit allocation as example. - cfg->ts_target_bitrate[0] = 0.45f * bitrate; - cfg->ts_target_bitrate[1] = 0.65f * bitrate; + cfg->ts_target_bitrate[0] = (int)(0.45f * bitrate); + cfg->ts_target_bitrate[1] = (int)(0.65f * bitrate); cfg->ts_target_bitrate[2] = bitrate; /* 0=L, 1=GF, 2=ARF */ @@ -294,8 +294,8 @@ int main(int argc, char **argv) { vpx_codec_err_t res[NUM_ENCODERS]; int i; - long width; - long height; + int width; + int height; int length_frame; int frame_avail; int got_data; @@ -347,9 +347,9 @@ int main(int argc, char **argv) { printf("Using %s\n", vpx_codec_iface_name(interface)); - width = strtol(argv[1], NULL, 0); - height = strtol(argv[2], NULL, 0); - framerate = strtol(argv[3], NULL, 0); + width = (int)strtol(argv[1], NULL, 0); + height = (int)strtol(argv[2], NULL, 0); + framerate = (int)strtol(argv[3], NULL, 0); if (width < 16 || width % 2 || height < 16 || height % 2) die("Invalid resolution: %ldx%ld", width, height); @@ -371,12 +371,13 @@ int main(int argc, char **argv) { // Bitrates per spatial layer: overwrite default rates above. 
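An aside on the write_ivf_frame_header() hunk above: it packs a 12-byte little-endian IVF frame header, a 32-bit frame size followed by the 64-bit pts stored as two 32-bit halves. A self-contained sketch of that layout, with put_le32 standing in for libvpx's mem_put_le32():

    #include <stddef.h>
    #include <stdint.h>

    // Little-endian 32-bit store, mirroring mem_put_le32().
    static void put_le32(uint8_t *mem, uint32_t val) {
      mem[0] = (uint8_t)(val & 0xff);
      mem[1] = (uint8_t)((val >> 8) & 0xff);
      mem[2] = (uint8_t)((val >> 16) & 0xff);
      mem[3] = (uint8_t)((val >> 24) & 0xff);
    }

    // 12-byte IVF frame header: frame size, pts low word, pts high word.
    static void pack_ivf_frame_header(uint8_t header[12], size_t frame_sz,
                                      int64_t pts) {
      put_le32(header, (uint32_t)frame_sz);
      put_le32(header + 4, (uint32_t)(pts & 0xFFFFFFFF));
      put_le32(header + 8, (uint32_t)(pts >> 32));
    }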
for (i = 0; i < NUM_ENCODERS; i++) { - target_bitrate[i] = strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0); + target_bitrate[i] = (int)strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0); } // Temporal layers per spatial layers: overwrite default settings above. for (i = 0; i < NUM_ENCODERS; i++) { - num_temporal_layers[i] = strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0); + num_temporal_layers[i] = + (int)strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0); if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3) die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n", num_temporal_layers); @@ -391,9 +392,9 @@ int main(int argc, char **argv) { downsampled_input[i] = fopen(filename, "wb"); } - key_frame_insert = strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0); + key_frame_insert = (int)strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0); - show_psnr = strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0); + show_psnr = (int)strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0); /* Populate default encoder configuration */ for (i = 0; i < NUM_ENCODERS; i++) { @@ -469,7 +470,7 @@ int main(int argc, char **argv) { if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32)) die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); - if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w) + if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w) read_frame_p = read_frame; else read_frame_p = read_frame_by_row; @@ -558,7 +559,8 @@ int main(int argc, char **argv) { /* Write out down-sampled input. */ length_frame = cfg[i].g_w * cfg[i].g_h * 3 / 2; if (fwrite(raw[i].planes[0], 1, length_frame, - downsampled_input[NUM_ENCODERS - i - 1]) != length_frame) { + downsampled_input[NUM_ENCODERS - i - 1]) != + (unsigned int)length_frame) { return EXIT_FAILURE; } } @@ -619,10 +621,6 @@ int main(int argc, char **argv) { break; default: break; } - printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT && - (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY) - ? "K" - : ""); fflush(stdout); } } @@ -663,7 +661,6 @@ int main(int argc, char **argv) { write_ivf_file_header(outfile[i], &cfg[i], frame_cnt - 1); fclose(outfile[i]); } - printf("\n"); return EXIT_SUCCESS; } diff --git a/libvpx/examples/vp9_spatial_svc_encoder.c b/libvpx/examples/vp9_spatial_svc_encoder.c index 1f5078aad..0987cbfb8 100644 --- a/libvpx/examples/vp9_spatial_svc_encoder.c +++ b/libvpx/examples/vp9_spatial_svc_encoder.c @@ -168,7 +168,7 @@ void usage_exit(void) { static void parse_command_line(int argc, const char **argv_, AppInput *app_input, SvcContext *svc_ctx, vpx_codec_enc_cfg_t *enc_cfg) { - struct arg arg = { 0 }; + struct arg arg; char **argv = NULL; char **argi = NULL; char **argj = NULL; @@ -509,7 +509,7 @@ static void printout_rate_control_summary(struct RateControlStats *rc, } vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, - uint32_t sizes[8], int *count) { + uint64_t sizes[8], int *count) { // A chunk ending with a byte matching 0xc0 is an invalid chunk unless // it is a super frame index. If the last byte of real video compression // data is 0xc0 the encoder must add a 0 byte. 
If we have the marker but @@ -606,9 +606,9 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers, } int main(int argc, const char **argv) { - AppInput app_input = { 0 }; + AppInput app_input; VpxVideoWriter *writer = NULL; - VpxVideoInfo info = { 0 }; + VpxVideoInfo info; vpx_codec_ctx_t codec; vpx_codec_enc_cfg_t enc_cfg; SvcContext svc_ctx; @@ -640,8 +640,9 @@ int main(int argc, const char **argv) { // Allocate image buffer #if CONFIG_VP9_HIGHBITDEPTH - if (!vpx_img_alloc(&raw, enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420 - : VPX_IMG_FMT_I42016, + if (!vpx_img_alloc(&raw, + enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420 + : VPX_IMG_FMT_I42016, enc_cfg.g_w, enc_cfg.g_h, 32)) { die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); } @@ -699,12 +700,16 @@ int main(int argc, const char **argv) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed); if (svc_ctx.threads) { vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1)); - vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0); + if (svc_ctx.threads > 1) + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1); + else + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0); } if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1) vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); if (svc_ctx.speed >= 5) vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, 900); // Encode frames while (!end_of_stream) { @@ -765,7 +770,7 @@ int main(int argc, const char **argv) { SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal; if (cx_pkt->data.frame.sz > 0) { #if OUTPUT_RC_STATS - uint32_t sizes[8]; + uint64_t sizes[8]; int count = 0; #endif vpx_video_writer_write_frame(writer, cx_pkt->data.frame.buf, @@ -777,6 +782,8 @@ int main(int argc, const char **argv) { vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id); parse_superframe_index(cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, sizes, &count); + if (enc_cfg.ss_number_layers == 1) + sizes[0] = cx_pkt->data.frame.sz; // Note computing input_layer_frames here won't account for frame // drops in rate control stats. // TODO(marpan): Fix this for non-bypass mode so we can get stats diff --git a/libvpx/examples/vpx_temporal_svc_encoder.c b/libvpx/examples/vpx_temporal_svc_encoder.c index c34673b05..f5736ea45 100644 --- a/libvpx/examples/vpx_temporal_svc_encoder.c +++ b/libvpx/examples/vpx_temporal_svc_encoder.c @@ -26,17 +26,27 @@ #include "../tools_common.h" #include "../video_writer.h" +#define VP8_ROI_MAP 0 + static const char *exec_name; void usage_exit(void) { exit(EXIT_FAILURE); } -// Denoiser states, for temporal denoising. -enum denoiserState { - kDenoiserOff, - kDenoiserOnYOnly, - kDenoiserOnYUV, - kDenoiserOnYUVAggressive, - kDenoiserOnAdaptive +// Denoiser states for vp8, for temporal denoising. +enum denoiserStateVp8 { + kVp8DenoiserOff, + kVp8DenoiserOnYOnly, + kVp8DenoiserOnYUV, + kVp8DenoiserOnYUVAggressive, + kVp8DenoiserOnAdaptive +}; + +// Denoiser states for vp9, for temporal denoising. +enum denoiserStateVp9 { + kVp9DenoiserOff, + kVp9DenoiserOnYOnly, + // For SVC: denoise the top two spatial layers. + kVp9DenoiserOnYTwoSpatialLayers }; static int mode_to_num_layers[13] = { 1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3 }; @@ -154,6 +164,53 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc, die("Error: Number of input frames not equal to output! 
\n"); } +#if VP8_ROI_MAP +static void vp8_set_roi_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi) { + unsigned int i, j; + memset(roi, 0, sizeof(*roi)); + + // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for + // segment is 16x16 for vp8, 8x8 for vp9. + roi->rows = (cfg->g_h + 15) / 16; + roi->cols = (cfg->g_w + 15) / 16; + + // Applies delta QP on the segment blocks, varies from -63 to 63. + // Setting to negative means lower QP (better quality). + // Below we set delta_q to the extreme (-63) to show strong effect. + roi->delta_q[0] = 0; + roi->delta_q[1] = -63; + roi->delta_q[2] = 0; + roi->delta_q[3] = 0; + + // Applies delta loopfilter strength on the segment blocks, varies from -63 to + // 63. Setting to positive means stronger loopfilter. + roi->delta_lf[0] = 0; + roi->delta_lf[1] = 0; + roi->delta_lf[2] = 0; + roi->delta_lf[3] = 0; + + // Applies skip encoding threshold on the segment blocks, varies from 0 to + // UINT_MAX. Larger value means more skipping of encoding is possible. + // This skip threshold only applies on delta frames. + roi->static_threshold[0] = 0; + roi->static_threshold[1] = 0; + roi->static_threshold[2] = 0; + roi->static_threshold[3] = 0; + + // Use 2 states: 1 is center square, 0 is the rest. + roi->roi_map = + (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map)); + for (i = 0; i < roi->rows; ++i) { + for (j = 0; j < roi->cols; ++j) { + if (i > (roi->rows >> 2) && i < ((roi->rows * 3) >> 2) && + j > (roi->cols >> 2) && j < ((roi->cols * 3) >> 2)) { + roi->roi_map[i * roi->cols + j] = 1; + } + } + } +} +#endif + // Temporal scaling parameters: // NOTE: The 3 prediction frames cannot be used interchangeably due to // differences in the way they are handled throughout the code. The @@ -506,11 +563,10 @@ int main(int argc, char **argv) { int layering_mode = 0; int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 }; int flag_periodicity = 1; -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) - vpx_svc_layer_id_t layer_id = { 0, 0 }; -#else - vpx_svc_layer_id_t layer_id = { 0 }; +#if VP8_ROI_MAP + vpx_roi_map_t roi; #endif + vpx_svc_layer_id_t layer_id = { 0, 0 }; const VpxInterface *encoder = NULL; FILE *infile = NULL; struct RateControlMetrics rc; @@ -637,7 +693,7 @@ int main(int argc, char **argv) { if (strncmp(encoder->name, "vp9", 3) == 0) cfg.rc_max_quantizer = 52; cfg.rc_undershoot_pct = 50; cfg.rc_overshoot_pct = 50; - cfg.rc_buf_initial_sz = 500; + cfg.rc_buf_initial_sz = 600; cfg.rc_buf_optimal_sz = 600; cfg.rc_buf_sz = 1000; @@ -707,9 +763,15 @@ int main(int argc, char **argv) { if (strncmp(encoder->name, "vp8", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); - vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff); + vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0); +#if VP8_ROI_MAP + vp8_set_roi_map(&cfg, &roi); + if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); +#endif + } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_svc_extra_cfg_t svc_params; memset(&svc_params, 0, sizeof(svc_params)); @@ -718,7 +780,7 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0); vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0); vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0); - vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff); + 
vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kVp9DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1)); @@ -746,7 +808,7 @@ int main(int argc, char **argv) { // For generating smaller key frames, use a smaller max_intra_size_pct // value, like 100 or 200. { - const int max_intra_size_pct = 900; + const int max_intra_size_pct = 1000; vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct); } @@ -756,10 +818,8 @@ int main(int argc, char **argv) { struct vpx_usec_timer timer; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt; -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) // Update the temporal layer_id. No spatial layers in this test. layer_id.spatial_layer_id = 0; -#endif layer_id.temporal_layer_id = cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity]; if (strncmp(encoder->name, "vp9", 3) == 0) { diff --git a/libvpx/libs.mk b/libvpx/libs.mk index f1e924253..a3e2f9d0e 100644 --- a/libvpx/libs.mk +++ b/libvpx/libs.mk @@ -188,6 +188,13 @@ libvpx_srcs.txt: @echo $(CODEC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@ CLEAN-OBJS += libvpx_srcs.txt +# Assembly files that are included, but don't define symbols themselves. +# Filtered out to avoid Windows build warnings. +ASM_INCLUDES := \ + third_party/x86inc/x86inc.asm \ + vpx_config.asm \ + vpx_ports/x86_abi_support.asm \ + vpx_dsp/x86/bitdepth_conversion_sse2.asm \ ifeq ($(CONFIG_EXTERNAL_BUILD),yes) ifeq ($(CONFIG_MSVS),yes) @@ -199,14 +206,6 @@ vpx.def: $(call enabled,CODEC_EXPORTS) --out=$@ $^ CLEAN-OBJS += vpx.def -# Assembly files that are included, but don't define symbols themselves. -# Filtered out to avoid Visual Studio build warnings. -ASM_INCLUDES := \ - third_party/x86inc/x86inc.asm \ - vpx_config.asm \ - vpx_ports/x86_abi_support.asm \ - vpx_dsp/x86/bitdepth_conversion_sse2.asm \ - vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def @echo " [CREATE] $@" $(qexec)$(GEN_VCPROJ) \ @@ -229,13 +228,13 @@ vpx.$(VCPROJ_SFX): $(RTCD) endif else -LIBVPX_OBJS=$(call objs,$(CODEC_SRCS)) +LIBVPX_OBJS=$(call objs, $(filter-out $(ASM_INCLUDES), $(CODEC_SRCS))) OBJS-yes += $(LIBVPX_OBJS) LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) -SO_VERSION_MAJOR := 4 -SO_VERSION_MINOR := 1 +SO_VERSION_MAJOR := 5 +SO_VERSION_MINOR := 0 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib @@ -406,8 +405,16 @@ CLEAN-OBJS += libvpx_test_srcs.txt $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 @echo " [DOWNLOAD] $@" - $(qexec)trap 'rm -f $@' INT TERM &&\ - curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F)) + # Attempt to download the file using curl, retrying once if it fails for a + # partial file (18). + $(qexec)( \ + trap 'rm -f $@' INT TERM; \ + curl="curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))"; \ + $$curl; \ + case "$$?" 
in \ + 18) $$curl -C -;; \ + esac \ + ) testdata:: $(LIBVPX_TEST_DATA) $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\ diff --git a/libvpx/test/acm_random.h b/libvpx/test/acm_random.h index c2f6b0e41..d915cf913 100644 --- a/libvpx/test/acm_random.h +++ b/libvpx/test/acm_random.h @@ -11,6 +11,10 @@ #ifndef TEST_ACM_RANDOM_H_ #define TEST_ACM_RANDOM_H_ +#include <assert.h> + +#include <limits> + #include "third_party/googletest/src/include/gtest/gtest.h" #include "vpx/vpx_integer.h" @@ -50,6 +54,13 @@ class ACMRandom { return r < 128 ? r << 4 : r >> 4; } + uint32_t RandRange(const uint32_t range) { + // testing::internal::Random::Generate provides values in the range + // testing::internal::Random::kMaxRange. + assert(range <= testing::internal::Random::kMaxRange); + return random_.Generate(range); + } + int PseudoUniform(int range) { return random_.Generate(range); } int operator()(int n) { return PseudoUniform(n); } diff --git a/libvpx/test/android/Android.mk b/libvpx/test/android/Android.mk index 48872a2b6..7318de2fc 100644 --- a/libvpx/test/android/Android.mk +++ b/libvpx/test/android/Android.mk @@ -32,6 +32,7 @@ LOCAL_CPP_EXTENSION := .cc LOCAL_MODULE := gtest LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/ LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/ +LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/include/ LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc include $(BUILD_STATIC_LIBRARY) diff --git a/libvpx/test/avg_test.cc b/libvpx/test/avg_test.cc index c570bbc22..ad21198e4 100644 --- a/libvpx/test/avg_test.cc +++ b/libvpx/test/avg_test.cc @@ -23,6 +23,7 @@ #include "test/register_state_check.h" #include "test/util.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; @@ -367,6 +368,21 @@ TEST_P(SatdTest, Random) { Check(expected); } +TEST_P(SatdTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + DECLARE_ALIGNED(16, tran_low_t, coeff[1024]); + const int blocksize = GET_PARAM(0); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} + TEST_P(BlockErrorTestFP, MinValue) { const int64_t kMin = -32640; const int64_t expected = kMin * kMin * txfm_size_; @@ -396,6 +412,22 @@ TEST_P(BlockErrorTestFP, Random) { Check(expected); } +TEST_P(BlockErrorTestFP, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + DECLARE_ALIGNED(16, tran_low_t, coeff[1024]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[1024]); + const int blocksize = GET_PARAM(0); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, dqcoeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} + using std::tr1::make_tuple; INSTANTIATE_TEST_CASE_P( @@ -454,6 +486,21 @@ INSTANTIATE_TEST_CASE_P( make_tuple(1024, &vp9_block_error_fp_sse2))); #endif // HAVE_SSE2 +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P(AVX2, SatdTest, + ::testing::Values(make_tuple(16, &vpx_satd_avx2), + make_tuple(64, &vpx_satd_avx2), + make_tuple(256, &vpx_satd_avx2), + make_tuple(1024, &vpx_satd_avx2))); + +INSTANTIATE_TEST_CASE_P( + AVX2, 
BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2), + make_tuple(64, &vp9_block_error_fp_avx2), + make_tuple(256, &vp9_block_error_fp_avx2), + make_tuple(1024, &vp9_block_error_fp_avx2))); +#endif + #if HAVE_NEON INSTANTIATE_TEST_CASE_P( NEON, AverageTest, diff --git a/libvpx/test/buffer.h b/libvpx/test/buffer.h index 75016c91e..2175dad9d 100644 --- a/libvpx/test/buffer.h +++ b/libvpx/test/buffer.h @@ -19,6 +19,7 @@ #include "test/acm_random.h" #include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" namespace libvpx_test { @@ -29,29 +30,55 @@ class Buffer { int right_padding, int bottom_padding) : width_(width), height_(height), top_padding_(top_padding), left_padding_(left_padding), right_padding_(right_padding), - bottom_padding_(bottom_padding) { - Init(); - } + bottom_padding_(bottom_padding), alignment_(0), padding_value_(0), + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + + Buffer(int width, int height, int top_padding, int left_padding, + int right_padding, int bottom_padding, unsigned int alignment) + : width_(width), height_(height), top_padding_(top_padding), + left_padding_(left_padding), right_padding_(right_padding), + bottom_padding_(bottom_padding), alignment_(alignment), + padding_value_(0), stride_(0), raw_size_(0), num_elements_(0), + raw_buffer_(NULL) {} Buffer(int width, int height, int padding) : width_(width), height_(height), top_padding_(padding), left_padding_(padding), right_padding_(padding), - bottom_padding_(padding) { - Init(); - } + bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0), + raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} - ~Buffer() { delete[] raw_buffer_; } + Buffer(int width, int height, int padding, unsigned int alignment) + : width_(width), height_(height), top_padding_(padding), + left_padding_(padding), right_padding_(padding), + bottom_padding_(padding), alignment_(alignment), padding_value_(0), + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + + ~Buffer() { + if (alignment_) { + vpx_free(raw_buffer_); + } else { + delete[] raw_buffer_; + } + } T *TopLeftPixel() const; int stride() const { return stride_; } // Set the buffer (excluding padding) to 'value'. - void Set(const int value); + void Set(const T value); - // Set the buffer (excluding padding) to the output of ACMRandom function 'b'. + // Set the buffer (excluding padding) to the output of ACMRandom function + // 'rand_func'. void Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()); + // Set the buffer (excluding padding) to the output of ACMRandom function + // 'RandRange' with range 'low' to 'high' which typically must be within + // testing::internal::Random::kMaxRange (1u << 31). However, because we want + // to allow negative low (and high) values, it is restricted to INT32_MAX + // here. + void Set(ACMRandom *rand_class, const T low, const T high); + // Copy the contents of Buffer 'a' (excluding padding). void CopyFrom(const Buffer<T> &a); @@ -63,11 +90,11 @@ class Buffer { bool HasPadding() const; // Sets all the values in the buffer to 'padding_value'. - void SetPadding(const int padding_value); + void SetPadding(const T padding_value); // Checks if all the values (excluding padding) are equal to 'value' if the // Buffers are the same size. - bool CheckValues(const int value) const; + bool CheckValues(const T value) const; // Check that padding matches the expected value or there is no padding. 
bool CheckPadding() const; @@ -75,21 +102,36 @@ class Buffer { // Compare the non-padding portion of two buffers if they are the same size. bool CheckValues(const Buffer<T> &a) const; - private: - void Init() { - ASSERT_GT(width_, 0); - ASSERT_GT(height_, 0); - ASSERT_GE(top_padding_, 0); - ASSERT_GE(left_padding_, 0); - ASSERT_GE(right_padding_, 0); - ASSERT_GE(bottom_padding_, 0); + bool Init() { + if (raw_buffer_ != NULL) return false; + EXPECT_GT(width_, 0); + EXPECT_GT(height_, 0); + EXPECT_GE(top_padding_, 0); + EXPECT_GE(left_padding_, 0); + EXPECT_GE(right_padding_, 0); + EXPECT_GE(bottom_padding_, 0); stride_ = left_padding_ + width_ + right_padding_; - raw_size_ = stride_ * (top_padding_ + height_ + bottom_padding_); - raw_buffer_ = new (std::nothrow) T[raw_size_]; - ASSERT_TRUE(raw_buffer_ != NULL); + num_elements_ = stride_ * (top_padding_ + height_ + bottom_padding_); + raw_size_ = num_elements_ * sizeof(T); + if (alignment_) { + EXPECT_GE(alignment_, sizeof(T)); + // Ensure alignment of the first value will be preserved. + EXPECT_EQ((left_padding_ * sizeof(T)) % alignment_, 0u); + // Ensure alignment of the subsequent rows will be preserved when there is + // a stride. + if (stride_ != width_) { + EXPECT_EQ((stride_ * sizeof(T)) % alignment_, 0u); + } + raw_buffer_ = reinterpret_cast<T *>(vpx_memalign(alignment_, raw_size_)); + } else { + raw_buffer_ = new (std::nothrow) T[num_elements_]; + } + EXPECT_TRUE(raw_buffer_ != NULL); SetPadding(std::numeric_limits<T>::max()); + return !::testing::Test::HasFailure(); } + private: bool BufferSizesMatch(const Buffer<T> &a) const; const int width_; @@ -98,44 +140,70 @@ class Buffer { const int left_padding_; const int right_padding_; const int bottom_padding_; - int padding_value_; + const unsigned int alignment_; + T padding_value_; int stride_; int raw_size_; + int num_elements_; T *raw_buffer_; }; template <typename T> T *Buffer<T>::TopLeftPixel() const { - return raw_buffer_ + (top_padding_ * stride()) + left_padding_; + if (!raw_buffer_) return NULL; + return raw_buffer_ + (top_padding_ * stride_) + left_padding_; } template <typename T> -void Buffer<T>::Set(const int value) { +void Buffer<T>::Set(const T value) { + if (!raw_buffer_) return; T *src = TopLeftPixel(); for (int height = 0; height < height_; ++height) { for (int width = 0; width < width_; ++width) { src[width] = value; } - src += stride(); + src += stride_; } } template <typename T> void Buffer<T>::Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()) { + if (!raw_buffer_) return; T *src = TopLeftPixel(); for (int height = 0; height < height_; ++height) { for (int width = 0; width < width_; ++width) { src[width] = (*rand_class.*rand_func)(); } - src += stride(); + src += stride_; } } template <typename T> -void Buffer<T>::CopyFrom(const Buffer<T> &a) { - if (!BufferSizesMatch(a)) { - return; +void Buffer<T>::Set(ACMRandom *rand_class, const T low, const T high) { + if (!raw_buffer_) return; + + EXPECT_LE(low, high); + EXPECT_LE(static_cast<int64_t>(high) - low, + std::numeric_limits<int32_t>::max()); + + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + // 'low' will be promoted to unsigned given the return type of RandRange. + // Store the value as an int to avoid unsigned overflow warnings when + // 'low' is negative. 
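// Worked example of the mapping implemented just below: RandRange(high - low)
// returns an unsigned draw in [0, high - low]; adding `low` (done as int32_t
// to avoid unsigned wrap) shifts it into [low, high]. For instance, with
// low = -16 and high = 15, RandRange(31) yields [0, 31] and value + low
// yields [-16, 15].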
+ const int32_t value = + static_cast<int32_t>((*rand_class).RandRange(high - low)); + src[width] = static_cast<T>(value + low); + } + src += stride_; } +} + +template <typename T> +void Buffer<T>::CopyFrom(const Buffer<T> &a) { + if (!raw_buffer_) return; + if (!BufferSizesMatch(a)) return; T *a_src = a.TopLeftPixel(); T *b_src = this->TopLeftPixel(); @@ -150,10 +218,11 @@ void Buffer<T>::CopyFrom(const Buffer<T> &a) { template <typename T> void Buffer<T>::DumpBuffer() const { + if (!raw_buffer_) return; for (int height = 0; height < height_ + top_padding_ + bottom_padding_; ++height) { - for (int width = 0; width < stride(); ++width) { - printf("%4d", raw_buffer_[height + width * stride()]); + for (int width = 0; width < stride_; ++width) { + printf("%4d", raw_buffer_[height + width * stride_]); } printf("\n"); } @@ -161,14 +230,14 @@ void Buffer<T>::DumpBuffer() const { template <typename T> bool Buffer<T>::HasPadding() const { + if (!raw_buffer_) return false; return top_padding_ || left_padding_ || right_padding_ || bottom_padding_; } template <typename T> void Buffer<T>::PrintDifference(const Buffer<T> &a) const { - if (!BufferSizesMatch(a)) { - return; - } + if (!raw_buffer_) return; + if (!BufferSizesMatch(a)) return; T *a_src = a.TopLeftPixel(); T *b_src = TopLeftPixel(); @@ -206,17 +275,19 @@ void Buffer<T>::PrintDifference(const Buffer<T> &a) const { } template <typename T> -void Buffer<T>::SetPadding(const int padding_value) { +void Buffer<T>::SetPadding(const T padding_value) { + if (!raw_buffer_) return; padding_value_ = padding_value; T *src = raw_buffer_; - for (int i = 0; i < raw_size_; ++i) { + for (int i = 0; i < num_elements_; ++i) { src[i] = padding_value; } } template <typename T> -bool Buffer<T>::CheckValues(const int value) const { +bool Buffer<T>::CheckValues(const T value) const { + if (!raw_buffer_) return false; T *src = TopLeftPixel(); for (int height = 0; height < height_; ++height) { for (int width = 0; width < width_; ++width) { @@ -224,20 +295,19 @@ bool Buffer<T>::CheckValues(const int value) const { return false; } } - src += stride(); + src += stride_; } return true; } template <typename T> bool Buffer<T>::CheckPadding() const { - if (!HasPadding()) { - return true; - } + if (!raw_buffer_) return false; + if (!HasPadding()) return true; // Top padding. T const *top = raw_buffer_; - for (int i = 0; i < stride() * top_padding_; ++i) { + for (int i = 0; i < stride_ * top_padding_; ++i) { if (padding_value_ != top[i]) { return false; } @@ -251,7 +321,7 @@ bool Buffer<T>::CheckPadding() const { return false; } } - left += stride(); + left += stride_; } // Right padding. 
@@ -262,12 +332,12 @@ bool Buffer<T>::CheckPadding() const { return false; } } - right += stride(); + right += stride_; } // Bottom padding - T const *bottom = raw_buffer_ + (top_padding_ + height_) * stride(); - for (int i = 0; i < stride() * bottom_padding_; ++i) { + T const *bottom = raw_buffer_ + (top_padding_ + height_) * stride_; + for (int i = 0; i < stride_ * bottom_padding_; ++i) { if (padding_value_ != bottom[i]) { return false; } @@ -278,9 +348,8 @@ bool Buffer<T>::CheckPadding() const { template <typename T> bool Buffer<T>::CheckValues(const Buffer<T> &a) const { - if (!BufferSizesMatch(a)) { - return false; - } + if (!raw_buffer_) return false; + if (!BufferSizesMatch(a)) return false; T *a_src = a.TopLeftPixel(); T *b_src = this->TopLeftPixel(); @@ -298,6 +367,7 @@ bool Buffer<T>::CheckValues(const Buffer<T> &a) const { template <typename T> bool Buffer<T>::BufferSizesMatch(const Buffer<T> &a) const { + if (!raw_buffer_) return false; if (a.width_ != this->width_ || a.height_ != this->height_) { printf( "Reference buffer of size %dx%d does not match this buffer which is " diff --git a/libvpx/test/byte_alignment_test.cc b/libvpx/test/byte_alignment_test.cc index d78294d10..5a058b275 100644 --- a/libvpx/test/byte_alignment_test.cc +++ b/libvpx/test/byte_alignment_test.cc @@ -128,8 +128,8 @@ class ByteAlignmentTest // TODO(fgalligan): Move the MD5 testing code into another class. void OpenMd5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: " - << md5_file_name_; + ASSERT_TRUE(md5_file_ != NULL) + << "MD5 file open failed. Filename: " << md5_file_name_; } void CheckMd5(const vpx_image_t &img) { diff --git a/libvpx/test/comp_avg_pred_test.cc b/libvpx/test/comp_avg_pred_test.cc index 3feba7127..110e06583 100644 --- a/libvpx/test/comp_avg_pred_test.cc +++ b/libvpx/test/comp_avg_pred_test.cc @@ -15,7 +15,6 @@ #include "test/acm_random.h" #include "test/buffer.h" #include "test/register_state_check.h" -#include "vpx_ports/mem.h" #include "vpx_ports/vpx_timer.h" namespace { @@ -28,12 +27,13 @@ typedef void (*AvgPredFunc)(uint8_t *a, const uint8_t *b, int w, int h, uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; } -void reference_pred(const uint8_t *pred, const Buffer<uint8_t> &ref, int width, - int height, uint8_t *avg) { +void reference_pred(const Buffer<uint8_t> &pred, const Buffer<uint8_t> &ref, + int width, int height, Buffer<uint8_t> *avg) { for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { - avg[y * width + x] = avg_with_rounding( - pred[y * width + x], ref.TopLeftPixel()[y * ref.stride() + x]); + avg->TopLeftPixel()[y * avg->stride() + x] = + avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x], + ref.TopLeftPixel()[y * ref.stride() + x]); } } } @@ -50,22 +50,10 @@ class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> { ACMRandom rnd_; }; -void fill(ACMRandom *r, uint8_t *a, const int width, const int height) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - a[x + width * y] = r->Rand8(); - } - } -} - TEST_P(AvgPredTest, SizeCombinations) { // This is called as part of the sub pixel variance. As such it must be one of // the variance block sizes. 
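The test rewrite below swaps the DECLARE_ALIGNED stack arrays for aligned Buffer objects; allocation now happens in a fallible Init() rather than the constructor, so tests can ASSERT on it. The recurring pattern, sketched as it appears inside a test body:

    // 16-byte aligned, no padding, matching the SIMD code under test.
    Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, /*padding=*/0,
                                           /*alignment=*/16);
    ASSERT_TRUE(pred.Init());            // allocation and size checks run here
    pred.Set(&rnd_, &ACMRandom::Rand8);  // random fill; padding stays untouched

The reference itself is one line of arithmetic: avg_with_rounding(a, b) computes (a + b + 1) >> 1, so ties round up, e.g. (35 + 36 + 1) >> 1 = 36.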
- DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); - DECLARE_ALIGNED(16, uint8_t, avg_ref[64 * 64]); - DECLARE_ALIGNED(16, uint8_t, avg_chk[64 * 64]); - for (int width_pow = 2; width_pow <= 6; ++width_pow) { for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; ++height_pow) { @@ -80,15 +68,28 @@ TEST_P(AvgPredTest, SizeCombinations) { // Only the reference buffer may have a stride not equal to width. Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, ref_padding ? 8 : 0); + ASSERT_TRUE(ref.Init()); + Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(pred.Init()); + Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(avg_ref.Init()); + Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(avg_chk.Init()); - fill(&rnd_, pred, width, height); ref.Set(&rnd_, &ACMRandom::Rand8); - - reference_pred(pred, ref, width, height, avg_ref); - ASM_REGISTER_STATE_CHECK(avg_pred_func_( - avg_chk, pred, width, height, ref.TopLeftPixel(), ref.stride())); - ASSERT_EQ(memcmp(avg_ref, avg_chk, sizeof(*avg_ref) * width * height), - 0); + pred.Set(&rnd_, &ACMRandom::Rand8); + + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK( + avg_pred_func_(avg_chk.TopLeftPixel(), pred.TopLeftPixel(), width, + height, ref.TopLeftPixel(), ref.stride())); + + EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); + if (HasFailure()) { + printf("Width: %d Height: %d\n", width, height); + avg_chk.PrintDifference(avg_ref); + return; + } } } } @@ -98,25 +99,32 @@ TEST_P(AvgPredTest, CompareReferenceRandom) { const int width = 64; const int height = 32; Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, 8); - DECLARE_ALIGNED(16, uint8_t, pred[width * height]); - DECLARE_ALIGNED(16, uint8_t, avg_ref[width * height]); - DECLARE_ALIGNED(16, uint8_t, avg_chk[width * height]); + ASSERT_TRUE(ref.Init()); + Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(pred.Init()); + Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(avg_ref.Init()); + Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(avg_chk.Init()); for (int i = 0; i < 500; ++i) { - fill(&rnd_, pred, width, height); ref.Set(&rnd_, &ACMRandom::Rand8); + pred.Set(&rnd_, &ACMRandom::Rand8); - reference_pred(pred, ref, width, height, avg_ref); - ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk, pred, width, height, + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk.TopLeftPixel(), + pred.TopLeftPixel(), width, height, ref.TopLeftPixel(), ref.stride())); - ASSERT_EQ(memcmp(avg_ref, avg_chk, sizeof(*avg_ref) * width * height), 0); + EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); + if (HasFailure()) { + printf("Width: %d Height: %d\n", width, height); + avg_chk.PrintDifference(avg_ref); + return; + } } } TEST_P(AvgPredTest, DISABLED_Speed) { - DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); - DECLARE_ALIGNED(16, uint8_t, avg[64 * 64]); - for (int width_pow = 2; width_pow <= 6; ++width_pow) { for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; ++height_pow) { @@ -128,15 +136,20 @@ TEST_P(AvgPredTest, DISABLED_Speed) { const int height = 1 << height_pow; Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, ref_padding ? 
8 : 0); + ASSERT_TRUE(ref.Init()); + Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(pred.Init()); + Buffer<uint8_t> avg = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(avg.Init()); - fill(&rnd_, pred, width, height); ref.Set(&rnd_, &ACMRandom::Rand8); + pred.Set(&rnd_, &ACMRandom::Rand8); vpx_usec_timer timer; vpx_usec_timer_start(&timer); for (int i = 0; i < 10000000 / (width * height); ++i) { - avg_pred_func_(avg, pred, width, height, ref.TopLeftPixel(), - ref.stride()); + avg_pred_func_(avg.TopLeftPixel(), pred.TopLeftPixel(), width, height, + ref.TopLeftPixel(), ref.stride()); } vpx_usec_timer_mark(&timer); @@ -156,6 +169,12 @@ INSTANTIATE_TEST_CASE_P(C, AvgPredTest, INSTANTIATE_TEST_CASE_P(SSE2, AvgPredTest, ::testing::Values(&vpx_comp_avg_pred_sse2)); #endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_neon)); +#endif // HAVE_NEON + #if HAVE_VSX INSTANTIATE_TEST_CASE_P(VSX, AvgPredTest, ::testing::Values(&vpx_comp_avg_pred_vsx)); diff --git a/libvpx/test/convolve_test.cc b/libvpx/test/convolve_test.cc index 535b9b07f..70f0b11a7 100644 --- a/libvpx/test/convolve_test.cc +++ b/libvpx/test/convolve_test.cc @@ -33,9 +33,9 @@ static const unsigned int kMaxDimension = 64; typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h); + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h); typedef void (*WrapperFilterBlock2d8Func)( const uint8_t *src_ptr, const unsigned int src_stride, @@ -550,7 +550,7 @@ TEST_P(ConvolveTest, DISABLED_Copy_Speed) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0, + UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0, width, height); } vpx_usec_timer_mark(&timer); @@ -570,7 +570,7 @@ TEST_P(ConvolveTest, DISABLED_Avg_Speed) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0, + UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0, width, height); } vpx_usec_timer_mark(&timer); @@ -580,12 +580,127 @@ TEST_P(ConvolveTest, DISABLED_Avg_Speed) { UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); } +TEST_P(ConvolveTest, DISABLED_Scale_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("convolve_scale_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? 
UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Horiz_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->h8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_horiz_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->v8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_vert_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[1](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_avg_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? 
UUT_->use_highbd_ : 8, elapsed_time); +} + TEST_P(ConvolveTest, Copy) { uint8_t *const in = input(); uint8_t *const out = output(); ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride, - NULL, 0, NULL, 0, Width(), Height())); + NULL, 0, 0, 0, 0, Width(), Height())); CheckGuardBlocks(); @@ -604,7 +719,7 @@ TEST_P(ConvolveTest, Avg) { CopyOutputToRef(); ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride, - NULL, 0, NULL, 0, Width(), Height())); + NULL, 0, 0, 0, 0, Width(), Height())); CheckGuardBlocks(); @@ -621,12 +736,10 @@ TEST_P(ConvolveTest, Avg) { TEST_P(ConvolveTest, CopyHoriz) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; ASM_REGISTER_STATE_CHECK(UUT_->sh8_[0](in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -641,12 +754,10 @@ TEST_P(ConvolveTest, CopyHoriz) { TEST_P(ConvolveTest, CopyVert) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; ASM_REGISTER_STATE_CHECK(UUT_->sv8_[0](in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -661,12 +772,10 @@ TEST_P(ConvolveTest, CopyVert) { TEST_P(ConvolveTest, Copy2D) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; ASM_REGISTER_STATE_CHECK(UUT_->shv8_[0](in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -702,7 +811,6 @@ TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) { } } -const int16_t kInvalidFilter[8] = { 0 }; const WrapperFilterBlock2d8Func wrapper_filter_block2d_8[2] = { wrapper_filter_block2d_8_c, wrapper_filter_average_block2d_8_c }; @@ -755,21 +863,21 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { Width(), Height(), UUT_->use_highbd_); if (filter_x && filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->hv8_[i]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_[i](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); else if (filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->v8_[i]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->v8_[i](in, kInputStride, out, kOutputStride, filters, 0, + 16, filter_y, 16, Width(), Height())); else if (filter_x) - ASM_REGISTER_STATE_CHECK(UUT_->h8_[i]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - kInvalidFilter, 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); else - ASM_REGISTER_STATE_CHECK(UUT_->copy_[i]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 0, - kInvalidFilter, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](in, kInputStride, out, + kOutputStride, NULL, 0, 0, + 0, 0, Width(), Height())); CheckGuardBlocks(); @@ -853,21 +961,21 @@ TEST_P(ConvolveTest, FilterExtremes) { filters[filter_y], ref, 
kOutputStride, Width(), Height(), UUT_->use_highbd_); if (filter_x && filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->hv8_[0]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); else if (filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->v8_[0]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->v8_[0](in, kInputStride, out, kOutputStride, filters, 0, + 16, filter_y, 16, Width(), Height())); else if (filter_x) - ASM_REGISTER_STATE_CHECK(UUT_->h8_[0]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - kInvalidFilter, 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); else - ASM_REGISTER_STATE_CHECK(UUT_->copy_[0]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 0, - kInvalidFilter, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, + kOutputStride, NULL, 0, 0, + 0, 0, Width(), Height())); for (int y = 0; y < Height(); ++y) { for (int x = 0; x < Width(); ++x) @@ -886,45 +994,63 @@ TEST_P(ConvolveTest, FilterExtremes) { /* This test exercises that enough rows and columns are filtered with every possible initial fractional positions and scaling steps. */ +#if !CONFIG_VP9_HIGHBITDEPTH +static const ConvolveFunc scaled_2d_c_funcs[2] = { vpx_scaled_2d_c, + vpx_scaled_avg_2d_c }; + TEST_P(ConvolveTest, CheckScalingFiltering) { uint8_t *const in = input(); uint8_t *const out = output(); - const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP]; + uint8_t ref[kOutputStride * kMaxDimension]; - SetConstantInput(127); + ::libvpx_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + const uint16_t r = prng.Rand8Extremes(); + assign_val(in, y * kInputStride + x, r); + } + } + + for (int i = 0; i < 2; ++i) { + for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) { + const InterpKernel *const eighttap = vp9_filter_kernels[filter_type]; + for (int frac = 0; frac < 16; ++frac) { + for (int step = 1; step <= 32; ++step) { + /* Test the horizontal and vertical filters in combination. */ + scaled_2d_c_funcs[i](in, kInputStride, ref, kOutputStride, eighttap, + frac, step, frac, step, Width(), Height()); + ASM_REGISTER_STATE_CHECK( + UUT_->shv8_[i](in, kInputStride, out, kOutputStride, eighttap, + frac, step, frac, step, Width(), Height())); - for (int frac = 0; frac < 16; ++frac) { - for (int step = 1; step <= 32; ++step) { - /* Test the horizontal and vertical filters in combination. 
*/ - ASM_REGISTER_STATE_CHECK( - UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap[frac], - step, eighttap[frac], step, Width(), Height())); - - CheckGuardBlocks(); - - for (int y = 0; y < Height(); ++y) { - for (int x = 0; x < Width(); ++x) { - ASSERT_EQ(lookup(in, y * kInputStride + x), - lookup(out, y * kOutputStride + x)) - << "x == " << x << ", y == " << y << ", frac == " << frac - << ", step == " << step; + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "x == " << x << ", y == " << y << ", frac == " << frac + << ", step == " << step; + } + } } } } } } +#endif using std::tr1::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH -#define WRAP(func, bd) \ - void wrap_##func##_##bd( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, \ - const int16_t *filter_y, int filter_y_stride, int w, int h) { \ - vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride, \ - reinterpret_cast<uint16_t *>(dst), dst_stride, filter_x, \ - filter_x_stride, filter_y, filter_y_stride, w, h, bd); \ +#define WRAP(func, bd) \ + void wrap_##func##_##bd( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride, \ + reinterpret_cast<uint16_t *>(dst), dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ } #if HAVE_SSE2 && ARCH_X86_64 @@ -1161,8 +1287,8 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, #else // !CONFIG_VP9_HIGHBITDEPTH const ConvolveFunctions convolve8_avx2( vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2, - vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2, - vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3, + vpx_convolve8_avg_horiz_avx2, vpx_convolve8_vert_avx2, + vpx_convolve8_avg_vert_avx2, vpx_convolve8_avx2, vpx_convolve8_avg_avx2, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) }; @@ -1206,7 +1332,7 @@ const ConvolveFunctions convolve8_neon( vpx_convolve8_avg_horiz_neon, vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon, vpx_convolve8_neon, vpx_convolve8_avg_neon, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, - vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + vpx_scaled_avg_vert_c, vpx_scaled_2d_neon, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon) }; #endif // CONFIG_VP9_HIGHBITDEPTH @@ -1233,7 +1359,7 @@ const ConvolveFunctions convolve8_msa( vpx_convolve8_avg_horiz_msa, vpx_convolve8_vert_msa, vpx_convolve8_avg_vert_msa, vpx_convolve8_msa, vpx_convolve8_avg_msa, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, - vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + vpx_scaled_avg_vert_c, vpx_scaled_2d_msa, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) }; INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, diff --git a/libvpx/test/datarate_test.cc b/libvpx/test/datarate_test.cc index a120a88d2..31a8523d2 100644 --- a/libvpx/test/datarate_test.cc +++ b/libvpx/test/datarate_test.cc @@ -44,6 
+44,7 @@ class DatarateTestLarge denoiser_offon_test_ = 0; denoiser_offon_period_ = -1; gf_boost_ = 0; + use_roi_ = 0; } virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, @@ -54,6 +55,12 @@ class DatarateTestLarge encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_); } +#if CONFIG_VP8_ENCODER + if (use_roi_ == 1) { + encoder->Control(VP8E_SET_ROI_MAP, &roi_); + } +#endif + if (denoiser_offon_test_) { ASSERT_GT(denoiser_offon_period_, 0) << "denoiser_offon_period_ is not positive."; @@ -91,8 +98,8 @@ class DatarateTestLarge const bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; if (!key_frame) { - ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - << pkt->data.frame.pts; + ASSERT_GE(bits_in_buffer_model_, 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; } const int64_t frame_size_in_bits = pkt->data.frame.sz * 8; @@ -145,6 +152,8 @@ class DatarateTestLarge int denoiser_offon_period_; int set_cpu_used_; int gf_boost_; + int use_roi_; + vpx_roi_map_t roi_; }; #if CONFIG_TEMPORAL_DENOISING @@ -258,14 +267,6 @@ TEST_P(DatarateTestLarge, ChangingDropFrameThresh) { } } -// Disabled for tsan, see: -// https://bugs.chromium.org/p/webm/issues/detail?id=1049 -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define BUILDING_WITH_TSAN -#endif -#endif -#ifndef BUILDING_WITH_TSAN TEST_P(DatarateTestLarge, DropFramesMultiThreads) { denoiser_on_ = 0; cfg_.rc_buf_initial_sz = 500; @@ -285,7 +286,6 @@ TEST_P(DatarateTestLarge, DropFramesMultiThreads) { ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) << " The datarate for the file missed the target!"; } -#endif // !BUILDING_WITH_TSAN class DatarateTestRealTime : public DatarateTestLarge { public: @@ -402,10 +402,6 @@ TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) { } } -// Disabled for tsan, see: -// https://bugs.chromium.org/p/webm/issues/detail?id=1049 - -#ifndef BUILDING_WITH_TSAN TEST_P(DatarateTestRealTime, DropFramesMultiThreads) { denoiser_on_ = 0; cfg_.rc_buf_initial_sz = 500; @@ -426,7 +422,67 @@ TEST_P(DatarateTestRealTime, DropFramesMultiThreads) { ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) << " The datarate for the file missed the target!"; } -#endif + +TEST_P(DatarateTestRealTime, RegionOfInterest) { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + // Encode using multiple threads. + cfg_.g_threads = 2; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + cfg_.rc_target_bitrate = 450; + cfg_.g_w = 352; + cfg_.g_h = 288; + + ResetModel(); + + // Set ROI parameters + use_roi_ = 1; + memset(&roi_, 0, sizeof(roi_)); + + roi_.rows = (cfg_.g_h + 15) / 16; + roi_.cols = (cfg_.g_w + 15) / 16; + + roi_.delta_q[0] = 0; + roi_.delta_q[1] = -20; + roi_.delta_q[2] = 0; + roi_.delta_q[3] = 0; + + roi_.delta_lf[0] = 0; + roi_.delta_lf[1] = -20; + roi_.delta_lf[2] = 0; + roi_.delta_lf[3] = 0; + + roi_.static_threshold[0] = 0; + roi_.static_threshold[1] = 1000; + roi_.static_threshold[2] = 0; + roi_.static_threshold[3] = 0; + + // Use 2 states: 1 is center square, 0 is the rest. 
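The ROI map just configured is one byte per 16x16 macroblock, so for this 352x288 clip it spans 22 columns by 18 rows, and segment 1 (the center square marked by the fill loop that follows) gets a quality boost through the -20 delta_q/delta_lf values plus a high static threshold. A minimal standalone sketch of the same center-square map, with hypothetical names (rows/cols/map) standing in for the test's roi_ fields:

  const unsigned int rows = (288 + 15) / 16;  // 18 macroblock rows
  const unsigned int cols = (352 + 15) / 16;  // 22 macroblock columns
  uint8_t *map = (uint8_t *)calloc(rows * cols, sizeof(*map));
  for (unsigned int i = 0; i < rows; ++i)
    for (unsigned int j = 0; j < cols; ++j)
      if (i > rows / 4 && i < (rows * 3) / 4 &&
          j > cols / 4 && j < (cols * 3) / 4)
        map[i * cols + j] = 1;  // segment 1: the boosted center square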
+ roi_.roi_map = + (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)); + for (unsigned int i = 0; i < roi_.rows; ++i) { + for (unsigned int j = 0; j < roi_.cols; ++j) { + if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && + j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { + roi_.roi_map[i * roi_.cols + j] = 1; + } + } + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + + free(roi_.roi_map); +} TEST_P(DatarateTestRealTime, GFBoost) { denoiser_on_ = 0; @@ -482,6 +538,7 @@ class DatarateTestVP9Large } denoiser_offon_test_ = 0; denoiser_offon_period_ = -1; + frame_parallel_decoding_mode_ = 1; } // @@ -561,6 +618,8 @@ class DatarateTestVP9Large encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1)); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, + frame_parallel_decoding_mode_); if (cfg_.ts_number_layers > 1) { if (video->frame() == 0) { @@ -599,8 +658,8 @@ class DatarateTestVP9Large duration * timebase_ * cfg_.rc_target_bitrate * 1000); // Buffer should not go negative. - ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - << pkt->data.frame.pts; + ASSERT_GE(bits_in_buffer_model_, 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; const size_t frame_size_in_bits = pkt->data.frame.sz * 8; @@ -641,6 +700,7 @@ class DatarateTestVP9Large int denoiser_on_; int denoiser_offon_test_; int denoiser_offon_period_; + int frame_parallel_decoding_mode_; }; // Check basic rate targeting for VBR mode with 0 lag. @@ -659,7 +719,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagZero) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) << " The datarate for the file is greater than target by too much!"; } } @@ -686,7 +746,37 @@ TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZero) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) + << " The datarate for the file is greater than target by too much!"; + } +} + +// Check basic rate targeting for VBR mode with non-zero lag, with +// frame_parallel_decoding_mode off. This enables the adapt_coeff/mode/mv probs +// since error_resilience is off. +TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZeroFrameParDecOff) { + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_error_resilient = 0; + cfg_.rc_end_usage = VPX_VBR; + // For non-zero lag, rate control will work (be within bounds) for + // real-time mode. 
+ if (deadline_ == VPX_DL_REALTIME) { + cfg_.g_lag_in_frames = 15; + } else { + cfg_.g_lag_in_frames = 0; + } + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + for (int i = 400; i <= 800; i += 400) { + cfg_.rc_target_bitrate = i; + ResetModel(); + frame_parallel_decoding_mode_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) << " The datarate for the file is greater than target by too much!"; } } @@ -715,6 +805,33 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting) { } } +// Check basic rate targeting for CBR mode, with frame_parallel_decoding_mode +// off (and error_resilience off). +TEST_P(DatarateTestVP9Large, BasicRateTargetingFrameParDecOff) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 140); + for (int i = 150; i < 800; i += 200) { + cfg_.rc_target_bitrate = i; + ResetModel(); + frame_parallel_decoding_mode_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; + } +} + // Check basic rate targeting for CBR mode, with 2 threads and dropped frames. TEST_P(DatarateTestVP9Large, BasicRateTargetingDropFramesMultiThreads) { cfg_.rc_buf_initial_sz = 500; @@ -1099,16 +1216,17 @@ class DatarateOnePassCbrSvc } virtual void ResetModel() { last_pts_ = 0; - bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; - frame_number_ = 0; - first_drop_ = 0; - bits_total_ = 0; duration_ = 0.0; mismatch_psnr_ = 0.0; mismatch_nframes_ = 0; denoiser_on_ = 0; tune_content_ = 0; base_speed_setting_ = 5; + spatial_layer_id_ = 0; + temporal_layer_id_ = 0; + memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_)); + memset(bits_total_, 0, sizeof(bits_total_)); + memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_)); } virtual void BeginPassHook(unsigned int /*pass*/) {} virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, @@ -1139,32 +1257,94 @@ class DatarateOnePassCbrSvc timebase_ = static_cast<double>(tb.num) / tb.den; duration_ = 0; } + + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + vpx_svc_layer_id_t layer_id; + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + spatial_layer_id_ = layer_id.spatial_layer_id; + temporal_layer_id_ = layer_id.temporal_layer_id; + // Update buffer with per-layer target frame bandwidth; this is done + // for every frame passed to the encoder (encoded or dropped). + // For temporal layers, update the cumulative buffer level.
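The loop below applies that credit. Together with the matching debit in FramePktHook further down, it forms a per-layer leaky bucket; a hedged sketch of the model, with buffer/encoded_bytes as shorthand for the test's bits_in_buffer_model_ and the parsed superframe sizes:

  buffer[layer] += layer_target_bitrate[layer] * 1000 / layer_fps   (every input frame)
  buffer[layer] -= 8 * encoded_bytes[layer]                         (every encoded frame)
  buffer[layer] >= 0 must hold                                      (no underrun)

Here layer_fps is the cumulative framerate of the temporal layer: with three temporal layers at 30 fps input, TL0 runs at 7.5 fps and TL0+TL1 at 15 fps, as assign_layer_bitrates below computes.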
+ for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_in_buffer_model_[layer] += + static_cast<int64_t>(layer_target_avg_bandwidth_[layer]); + } + } + } + + vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, + uint32_t sizes[8], int *count) { + uint8_t marker; + marker = *(data + data_sz - 1); + *count = 0; + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; + { + const uint8_t marker2 = *(data + data_sz - index_sz); + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; + } + { + uint32_t i, j; + const uint8_t *x = &data[data_sz - index_sz + 1]; + for (i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); + sizes[i] = this_sz; + } + *count = frames; + } + } + return VPX_CODEC_OK; + } + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; - if (last_pts_ == 0) duration = 1; - bits_in_buffer_model_ += static_cast<int64_t>( - duration * timebase_ * cfg_.rc_target_bitrate * 1000); + uint32_t sizes[8] = { 0 }; + int count = 0; + last_pts_ = pkt->data.frame.pts; const bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; - if (!key_frame) { - // TODO(marpan): This check currently fails for some of the SVC tests, - // re-enable when issue (webm:1350) is resolved. - // ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - // << pkt->data.frame.pts; + parse_superframe_index(static_cast<const uint8_t *>(pkt->data.frame.buf), + pkt->data.frame.sz, sizes, &count); + ASSERT_EQ(count, number_spatial_layers_); + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + sizes[sl] = sizes[sl] << 3; + // Update the total encoded bits per layer. + // For temporal layers, update the cumulative encoded bits per layer. + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_total_[layer] += static_cast<int64_t>(sizes[sl]); + // Update the per-layer buffer level with the encoded frame size. + bits_in_buffer_model_[layer] -= static_cast<int64_t>(sizes[sl]); + // There should be no buffer underrun, except on the base + // temporal layer, since there may be key frames there. 
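Before the underrun check continues below, a worked example of the trailing superframe index that parse_superframe_index just decoded (the byte values are hypothetical):

  marker = 0xc9 = 0b11001001  ->  (marker & 0xe0) == 0xc0, an index is present
  frames   = (marker & 0x7) + 1 = 2         // one frame per spatial layer
  mag      = ((marker >> 3) & 0x3) + 1 = 2  // 2 bytes per size field
  index_sz = 2 + mag * frames = 6           // marker + 2x2 size bytes + marker

So the chunk ends with { 0xc9, sz0_lo, sz0_hi, sz1_lo, sz1_hi, 0xc9 }, the per-frame sizes are assembled little-endian, and the ASSERT_EQ above holds because each encoded superframe carries exactly one frame per spatial layer.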
+ if (!key_frame && tl > 0) { + ASSERT_GE(bits_in_buffer_model_[layer], 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; + } + } } - const size_t frame_size_in_bits = pkt->data.frame.sz * 8; - bits_in_buffer_model_ -= static_cast<int64_t>(frame_size_in_bits); - bits_total_ += frame_size_in_bits; - if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1; - last_pts_ = pkt->data.frame.pts; - bits_in_last_frame_ = frame_size_in_bits; - ++frame_number_; } + virtual void EndPassHook(void) { - if (bits_total_) { - const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit - duration_ = (last_pts_ + 1) * timebase_; - file_datarate_ = file_size_in_kb / duration_; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = 0; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + const double file_size_in_kb = bits_total_[layer] / 1000.; + duration_ = (last_pts_ + 1) * timebase_; + file_datarate_[layer] = file_size_in_kb / duration_; + } } } @@ -1177,13 +1357,11 @@ class DatarateOnePassCbrSvc unsigned int GetMismatchFrames() { return mismatch_nframes_; } vpx_codec_pts_t last_pts_; - int64_t bits_in_buffer_model_; + int64_t bits_in_buffer_model_[VPX_MAX_LAYERS]; double timebase_; - int frame_number_; - vpx_codec_pts_t first_drop_; - int64_t bits_total_; + int64_t bits_total_[VPX_MAX_LAYERS]; double duration_; - double file_datarate_; + double file_datarate_[VPX_MAX_LAYERS]; size_t bits_in_last_frame_; vpx_svc_extra_cfg_t svc_params_; int speed_setting_; @@ -1192,14 +1370,22 @@ class DatarateOnePassCbrSvc int denoiser_on_; int tune_content_; int base_speed_setting_; + int spatial_layer_id_; + int temporal_layer_id_; + int number_spatial_layers_; + int number_temporal_layers_; + int layer_target_avg_bandwidth_[VPX_MAX_LAYERS]; }; static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg, const vpx_svc_extra_cfg_t *svc_params, int spatial_layers, int temporal_layers, - int temporal_layering_mode) { + int temporal_layering_mode, + int *layer_target_avg_bandwidth, + int64_t *bits_in_buffer_model) { int sl, spatial_layer_target; float total = 0; float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; + float framerate = 30.0; for (sl = 0; sl < spatial_layers; ++sl) { if (svc_params->scaling_factor_den[sl] > 0) { alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] * 1.0 / @@ -1219,10 +1405,43 @@ static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg, } else if (temporal_layering_mode == 2) { enc_cfg->layer_target_bitrate[index] = spatial_layer_target * 2 / 3; enc_cfg->layer_target_bitrate[index + 1] = spatial_layer_target; + } else if (temporal_layering_mode <= 1) { + enc_cfg->layer_target_bitrate[index] = spatial_layer_target; + } + } + for (sl = 0; sl < spatial_layers; ++sl) { + for (int tl = 0; tl < temporal_layers; ++tl) { + const int layer = sl * temporal_layers + tl; + float layer_framerate = framerate; + if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2; + if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4; + if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2; + layer_target_avg_bandwidth[layer] = static_cast<int>( + enc_cfg->layer_target_bitrate[layer] * 1000.0 / layer_framerate); + bits_in_buffer_model[layer] = + enc_cfg->layer_target_bitrate[layer] * enc_cfg->rc_buf_initial_sz; } } } +static void CheckLayerRateTargeting(vpx_codec_enc_cfg_t *const cfg, + int number_spatial_layers, + int number_temporal_layers, + double *file_datarate, + 
double thresh_overshoot, + double thresh_undershoot) { + for (int sl = 0; sl < number_spatial_layers; ++sl) + for (int tl = 0; tl < number_temporal_layers; ++tl) { + const int layer = sl * number_temporal_layers + tl; + ASSERT_GE(cfg->layer_target_bitrate[layer], + file_datarate[layer] * thresh_overshoot) + << " The datarate for the file exceeds the target by too much!"; + ASSERT_LE(cfg->layer_target_bitrate[layer], + file_datarate[layer] * thresh_undershoot) + << " The datarate for the file is lower than the target by too much!"; + } +} + // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 // temporal layer, with screen content mode on and same speed setting for all // layers. @@ -1246,14 +1465,19 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TLScreenContent1) { svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 10; cfg_.kf_max_dist = 9999; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 500; ResetModel(); tune_content_ = 1; base_speed_setting_ = speed_setting_; assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); } @@ -1281,26 +1505,28 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL) { svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); // TODO(marpan): Check that effective_datarate for each layer hits the // layer target_bitrate. for (int i = 200; i <= 800; i += 200) { cfg_.rc_target_bitrate = i; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, + 1.15); #if CONFIG_VP9_DECODER // Number of temporal layers > 1, so half of the frames in this SVC pattern // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expcet 100 (half of the sequence) + // Since frame dropper is off, we can expect 200 (half of the sequence) // mismatched frames. 
- EXPECT_EQ(static_cast<unsigned int>(100), GetMismatchFrames()); + EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames()); #endif } } @@ -1329,33 +1555,41 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLDenoiserOn) { svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); // TODO(marpan): Check that effective_datarate for each layer hits the // layer target_bitrate. - for (int i = 600; i <= 1000; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - denoiser_on_ = 1; - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; + // For SVC, noise_sen = 1 means denoising only the top spatial layer + // noise_sen = 2 means denoising the two top spatial layers. + for (int noise_sen = 1; noise_sen <= 2; noise_sen++) { + for (int i = 600; i <= 1000; i += 200) { + cfg_.rc_target_bitrate = i; + ResetModel(); + denoiser_on_ = noise_sen; + assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, + 1.15); #if CONFIG_VP9_DECODER - // Number of temporal layers > 1, so half of the frames in this SVC pattern - // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expcet 150 (half of the sequence) - // mismatched frames. - EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames()); + // Number of temporal layers > 1, so half of the frames in this SVC + // pattern + // will be non-reference frame and hence encoder will avoid loopfilter. + // Since frame dropper is off, we can expect 200 (half of the sequence) + // mismatched frames. + EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames()); #endif + } } } // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3 // temporal layers. Run CIF clip with 1 thread, and few short key frame periods. 
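Before the SmallKf variant below, it is worth unpacking the 0.78/1.15 pair these tests now pass to CheckLayerRateTargeting; the two assertions bound each layer's measured datarate from both sides:

  target >= 0.78 * datarate  =>  datarate <= target / 0.78  (about 1.28 * target)
  target <= 1.15 * datarate  =>  datarate >= target / 1.15  (about 0.87 * target)

For example, a 500 kbps layer passes anywhere between roughly 435 and 641 kbps, and the check runs over all number_spatial_layers_ * number_temporal_layers_ entries of file_datarate_.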
-TEST_P(DatarateOnePassCbrSvc, DISABLED_OnePassCbrSvc2SL3TLSmallKf) { +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLSmallKf) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -1376,27 +1610,29 @@ TEST_P(DatarateOnePassCbrSvc, DISABLED_OnePassCbrSvc2SL3TLSmallKf) { svc_params_.scaling_factor_num[1] = 288; svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 10; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); cfg_.rc_target_bitrate = 400; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); // For this 3 temporal layer case, pattern repeats every 4 frames, so choose // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). for (int j = 64; j <= 67; j++) { cfg_.kf_max_dist = j; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.80) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, + 1.15); } } // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and // 3 temporal layers. Run HD clip with 4 threads. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4threads) { +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4Threads) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -1418,22 +1654,23 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4threads) { svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); #if CONFIG_VP9_DECODER // Number of temporal layers > 1, so half of the frames in this SVC pattern // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expcet 150 (half of the sequence) + // Since frame dropper is off, we can expect 30 (half of the sequence) // mismatched frames. 
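The updated mismatch counts simply track the new clip lengths: in these three-temporal-layer patterns every other frame is a non-reference frame that skips the loop filter, so with the frame dropper off the decoder mismatch count is half the frame total:

  niklas_640_480_30.yuv runs:  400 frames -> 200 expected mismatches
  niklas_1280_720_30.y4m runs:  60 frames ->  30 expected mismatches

The revised EXPECT_EQ values follow.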
- EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames()); + EXPECT_EQ(static_cast<unsigned int>(30), GetMismatchFrames()); #endif } @@ -1463,22 +1700,24 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL) { svc_params_.scaling_factor_den[2] = 288; cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22) - << " The datarate for the file is lower than the target by too much!"; + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); #if CONFIG_VP9_DECODER // Number of temporal layers > 1, so half of the frames in this SVC pattern // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expcet 150 (half of the sequence) + // Since frame dropper is off, we can expect 200 (half of the sequence) // mismatched frames. - EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames()); + EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames()); #endif } @@ -1507,20 +1746,23 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) { svc_params_.scaling_factor_num[2] = 288; svc_params_.scaling_factor_den[2] = 288; cfg_.rc_dropframe_thresh = 10; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); cfg_.rc_target_bitrate = 800; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); // For this 3 temporal layer case, pattern repeats every 4 frames, so choose // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). 
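Concretely, with the 0-2-1-2 temporal pattern noted above repeating every 4 frames, the four kf_max_dist values iterated below land the forced key frame on each position of the cycle:

  kf_max_dist = 32  ->  32 % 4 == 0  ->  key frame on a TL0 slot
  kf_max_dist = 33  ->  33 % 4 == 1  ->  TL2
  kf_max_dist = 34  ->  34 % 4 == 2  ->  TL1
  kf_max_dist = 35  ->  35 % 4 == 3  ->  TL2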
for (int j = 32; j <= 35; j++) { cfg_.kf_max_dist = j; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.80) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.30) - << " The datarate for the file is lower than the target by too much!"; + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, + 1.15); } } @@ -1550,22 +1792,23 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL4threads) { svc_params_.scaling_factor_den[2] = 288; cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22) - << " The datarate for the file is lower than the target by too much!"; + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); #if CONFIG_VP9_DECODER // Number of temporal layers > 1, so half of the frames in this SVC pattern // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expcet 150 (half of the sequence) + // Since frame dropper is off, we can expect 30 (half of the sequence) // mismatched frames. 
- EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames()); + EXPECT_EQ(static_cast<unsigned int>(30), GetMismatchFrames()); #endif } @@ -1597,9 +1840,19 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TL5x5MultipleRuns) { cfg_.layer_target_bitrate[0] = 300; cfg_.layer_target_bitrate[1] = 1400; cfg_.rc_target_bitrate = 1700; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; ResetModel(); + layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30; + bits_in_buffer_model_[0] = + cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz; + layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30; + bits_in_buffer_model_[1] = + cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); } diff --git a/libvpx/test/dct16x16_test.cc b/libvpx/test/dct16x16_test.cc index 6ea77fde2..ce0bd37b3 100644 --- a/libvpx/test/dct16x16_test.cc +++ b/libvpx/test/dct16x16_test.cc @@ -542,8 +542,8 @@ class Trans16x16TestBase { const uint32_t diff = dst[j] - src[j]; #endif // CONFIG_VP9_HIGHBITDEPTH const uint32_t error = diff * diff; - EXPECT_GE(1u, error) << "Error: 16x16 IDCT has error " << error - << " at index " << j; + EXPECT_GE(1u, error) + << "Error: 16x16 IDCT has error " << error << " at index " << j; } } } @@ -744,66 +744,6 @@ TEST_P(InvTrans16x16DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); } -class PartialTrans16x16Test : public ::testing::TestWithParam< - std::tr1::tuple<FdctFunc, vpx_bit_depth_t> > { - public: - virtual ~PartialTrans16x16Test() {} - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - bit_depth_ = GET_PARAM(1); - } - - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - vpx_bit_depth_t bit_depth_; - FdctFunc fwd_txfm_; -}; - -TEST_P(PartialTrans16x16Test, Extremes) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - const int minval = -maxval; - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ((maxval * kNumCoeffs) >> 1, output[0]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ((minval * kNumCoeffs) >> 1, output[0]); -} - -TEST_P(PartialTrans16x16Test, Random) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - ACMRandom rnd(ACMRandom::DeterministicSeed()); - - int sum = 0; - for (int i = 0; i < kNumCoeffs; ++i) { - const int val = (i & 1) ? 
-rnd(maxval + 1) : rnd(maxval + 1); - input[i] = val; - sum += val; - } - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ(sum >> 1, output[0]); -} - using std::tr1::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH @@ -836,11 +776,6 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - C, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_8), - make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_12))); #else INSTANTIATE_TEST_CASE_P( C, Trans16x16HT, @@ -849,17 +784,14 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(C, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_c, - VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( NEON, Trans16x16DCT, - ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_neon, - 0, VPX_BITS_8))); -#endif + ::testing::Values(make_tuple(&vpx_fdct16x16_neon, + &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8))); +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( @@ -876,9 +808,6 @@ INSTANTIATE_TEST_CASE_P( 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -913,9 +842,6 @@ INSTANTIATE_TEST_CASE_P( &idct16x16_10_add_12_sse2, 3167, VPX_BITS_12), make_tuple(&idct16x16_12, &idct16x16_256_add_12_sse2, 3167, VPX_BITS_12))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -931,8 +857,12 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(MSA, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_msa, - VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(VSX, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_c, + &vpx_idct16x16_256_add_vsx, + 0, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libvpx/test/dct32x32_test.cc b/libvpx/test/dct32x32_test.cc index d8054c4eb..a95ff9732 100644 --- a/libvpx/test/dct32x32_test.cc +++ b/libvpx/test/dct32x32_test.cc @@ -292,67 +292,6 @@ TEST_P(Trans32x32Test, InverseAccuracy) { } } -class PartialTrans32x32Test - : public ::testing::TestWithParam< 
- std::tr1::tuple<FwdTxfmFunc, vpx_bit_depth_t> > { - public: - virtual ~PartialTrans32x32Test() {} - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - bit_depth_ = GET_PARAM(1); - } - - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - vpx_bit_depth_t bit_depth_; - FwdTxfmFunc fwd_txfm_; -}; - -TEST_P(PartialTrans32x32Test, Extremes) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - const int minval = -maxval; - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ((maxval * kNumCoeffs) >> 3, output[0]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ((minval * kNumCoeffs) >> 3, output[0]); -} - -TEST_P(PartialTrans32x32Test, Random) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - ACMRandom rnd(ACMRandom::DeterministicSeed()); - - int sum = 0; - for (int i = 0; i < kNumCoeffs; ++i) { - const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1); - input[i] = val; - sum += val; - } - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ(sum >> 3, output[0]); -} - using std::tr1::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH @@ -366,11 +305,6 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - C, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_8), - make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_12))); #else INSTANTIATE_TEST_CASE_P( C, Trans32x32Test, @@ -378,19 +312,16 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(C, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, - VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( NEON, Trans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_neon, - 0, VPX_BITS_8), - make_tuple(&vpx_fdct32x32_rd_c, + ::testing::Values(make_tuple(&vpx_fdct32x32_neon, + &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_neon, &vpx_idct32x32_1024_add_neon, 1, VPX_BITS_8))); -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( @@ -399,9 +330,6 @@ INSTANTIATE_TEST_CASE_P( &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && 
!CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -418,9 +346,6 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -439,8 +364,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_idct32x32_1024_add_msa, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_msa, &vpx_idct32x32_1024_add_msa, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(MSA, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, - VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P( + VSX, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_vsx, + 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_c, + &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libvpx/test/dct_partial_test.cc b/libvpx/test/dct_partial_test.cc new file mode 100644 index 000000000..4d145f589 --- /dev/null +++ b/libvpx/test/dct_partial_test.cc @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <math.h> +#include <stdlib.h> +#include <string.h> + +#include <limits> + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; +using std::tr1::tuple; +using std::tr1::make_tuple; + +namespace { +typedef void (*PartialFdctFunc)(const int16_t *in, tran_low_t *out, int stride); + +typedef tuple<PartialFdctFunc, int /* size */, vpx_bit_depth_t> + PartialFdctParam; + +tran_low_t partial_fdct_ref(const Buffer<int16_t> &in, int size) { + int64_t sum = 0; + for (int y = 0; y < size; ++y) { + for (int x = 0; x < size; ++x) { + sum += in.TopLeftPixel()[y * in.stride() + x]; + } + } + + switch (size) { + case 4: sum *= 2; break; + case 8: /*sum = sum;*/ break; + case 16: sum >>= 1; break; + case 32: sum >>= 3; break; + } + + return static_cast<tran_low_t>(sum); +} + +class PartialFdctTest : public ::testing::TestWithParam<PartialFdctParam> { + public: + PartialFdctTest() { + fwd_txfm_ = GET_PARAM(0); + size_ = GET_PARAM(1); + bit_depth_ = GET_PARAM(2); + } + + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void RunTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int16_t maxvalue = + clip_pixel_highbd(std::numeric_limits<int16_t>::max(), bit_depth_); + const int16_t minvalue = -maxvalue; + Buffer<int16_t> input_block = + Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(input_block.Init()); + Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < 100; ++i) { + if (i == 0) { + input_block.Set(maxvalue); + } else if (i == 1) { + input_block.Set(minvalue); + } else { + input_block.Set(&rnd, minvalue, maxvalue); + } + + ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block.TopLeftPixel(), + output_block.TopLeftPixel(), + input_block.stride())); + + EXPECT_EQ(partial_fdct_ref(input_block, size_), + output_block.TopLeftPixel()[0]); + } + } + + PartialFdctFunc fwd_txfm_; + vpx_bit_depth_t bit_depth_; + int size_; +}; + +TEST_P(PartialFdctTest, PartialFdctTest) { RunTest(); } + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_10), + make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_10), + make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_10), + make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + C, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_sse2, 16, VPX_BITS_8), + 
make_tuple(&vpx_fdct8x8_1_sse2, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_sse2, 4, VPX_BITS_8))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + NEON, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + NEON, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON + +#if HAVE_MSA +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P(MSA, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct8x8_1_msa, 8, + VPX_BITS_8))); +#else // !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + MSA, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_msa, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_msa, 8, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA +} // namespace diff --git a/libvpx/test/dct_test.cc b/libvpx/test/dct_test.cc new file mode 100644 index 000000000..addbdfb46 --- /dev/null +++ b/libvpx/test/dct_test.cc @@ -0,0 +1,737 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <math.h> +#include <stdlib.h> +#include <string.h> + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_entropy.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; +using std::tr1::tuple; +using std::tr1::make_tuple; + +namespace { +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +typedef void (*FhtFuncRef)(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, + int size, int tx_type); +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); + +/* forward transform, inverse transform, size, transform type, bit depth */ +typedef tuple<FdctFunc, IdctFunc, int, int, vpx_bit_depth_t> DctParam; +typedef tuple<FhtFunc, IhtFunc, int, int, vpx_bit_depth_t> HtParam; + +void fdct_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size, + int /*tx_type*/) { + const int16_t *i = in.TopLeftPixel(); + const int i_stride = in.stride(); + tran_low_t *o = out->TopLeftPixel(); + if (size == 4) { + vpx_fdct4x4_c(i, o, i_stride); + } else if (size == 8) { + vpx_fdct8x8_c(i, o, i_stride); + } else if (size == 16) { + vpx_fdct16x16_c(i, o, i_stride); + } else if (size == 32) { + vpx_fdct32x32_c(i, o, i_stride); + } +} + +void fht_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size, + int tx_type) { + const int16_t *i = in.TopLeftPixel(); + const int i_stride = in.stride(); + tran_low_t *o = out->TopLeftPixel(); + if (size == 4) { + vp9_fht4x4_c(i, o, i_stride, tx_type); + } else if (size == 8) { + vp9_fht8x8_c(i, o, i_stride, tx_type); + } else if (size == 16) { + vp9_fht16x16_c(i, o, i_stride, tx_type); + } +} + +void fwht_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size, + int /*tx_type*/) { + ASSERT_EQ(size, 4); + vp9_fwht4x4_c(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); +} + +#if CONFIG_VP9_HIGHBITDEPTH +#define idctNxN(n, coeffs, bitdepth) \ + void idct##n##x##n##_##bitdepth(const tran_low_t *in, uint8_t *out, \ + int stride) { \ + vpx_highbd_idct##n##x##n##_##coeffs##_add_c(in, CAST_TO_SHORTPTR(out), \ + stride, bitdepth); \ + } + +idctNxN(4, 16, 10); +idctNxN(4, 16, 12); +idctNxN(8, 64, 10); +idctNxN(8, 64, 12); +idctNxN(16, 256, 10); +idctNxN(16, 256, 12); +idctNxN(32, 1024, 10); +idctNxN(32, 1024, 12); + +#define ihtNxN(n, coeffs, bitdepth) \ + void iht##n##x##n##_##bitdepth(const tran_low_t *in, uint8_t *out, \ + int stride, int tx_type) { \ + vp9_highbd_iht##n##x##n##_##coeffs##_add_c(in, CAST_TO_SHORTPTR(out), \ + stride, tx_type, bitdepth); \ + } + +ihtNxN(4, 16, 10); +ihtNxN(4, 16, 12); +ihtNxN(8, 64, 10); +ihtNxN(8, 64, 12); +ihtNxN(16, 256, 10); +// ihtNxN(16, 256, 12); + +void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); +} + +void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +class TransTestBase { + public: + virtual void TearDown() { 
libvpx_test::ClearSystemState(); } + + protected: + virtual void RunFwdTxfm(const Buffer<int16_t> &in, + Buffer<tran_low_t> *out) = 0; + + virtual void RunInvTxfm(const Buffer<tran_low_t> &in, uint8_t *out) = 0; + + void RunAccuracyCheck(int limit) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + Buffer<int16_t> test_input_block = + Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(test_input_block.Init()); + Buffer<tran_low_t> test_temp_block = + Buffer<tran_low_t>(size_, size_, 0, 16); + ASSERT_TRUE(test_temp_block.Init()); + Buffer<uint8_t> dst = Buffer<uint8_t>(size_, size_, 0, 16); + ASSERT_TRUE(dst.Init()); + Buffer<uint8_t> src = Buffer<uint8_t>(size_, size_, 0, 16); + ASSERT_TRUE(src.Init()); +#if CONFIG_VP9_HIGHBITDEPTH + Buffer<uint16_t> dst16 = Buffer<uint16_t>(size_, size_, 0, 16); + ASSERT_TRUE(dst16.Init()); + Buffer<uint16_t> src16 = Buffer<uint16_t>(size_, size_, 0, 16); + ASSERT_TRUE(src16.Init()); +#endif // CONFIG_VP9_HIGHBITDEPTH + uint32_t max_error = 0; + int64_t total_error = 0; + const int count_test_block = 10000; + for (int i = 0; i < count_test_block; ++i) { + if (bit_depth_ == 8) { + src.Set(&rnd, &ACMRandom::Rand8); + dst.Set(&rnd, &ACMRandom::Rand8); + // Initialize a test block with input range [-255, 255]. + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = + src.TopLeftPixel()[h * src.stride() + w] - + dst.TopLeftPixel()[h * dst.stride() + w]; + } + } +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src16.Set(&rnd, 0, max_pixel_value_); + dst16.Set(&rnd, 0, max_pixel_value_); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = + src16.TopLeftPixel()[h * src16.stride() + w] - + dst16.TopLeftPixel()[h * dst16.stride() + w]; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block, &test_temp_block)); + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(test_temp_block, dst.TopLeftPixel())); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16.TopLeftPixel()))); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + int diff; +#if CONFIG_VP9_HIGHBITDEPTH + if (bit_depth_ != 8) { + diff = dst16.TopLeftPixel()[h * dst16.stride() + w] - + src16.TopLeftPixel()[h * src16.stride() + w]; + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + diff = dst.TopLeftPixel()[h * dst.stride() + w] - + src.TopLeftPixel()[h * src.stride() + w]; +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + if (max_error < error) max_error = error; + total_error += error; + } + } + } + + EXPECT_GE(static_cast<uint32_t>(limit), max_error) + << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit; + + EXPECT_GE(count_test_block * limit, total_error) + << "Error: 4x4 FHT/IHT has average round trip error > " << limit + << " per block"; + } + + void RunCoeffCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + Buffer<int16_t> input_block = + Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 
0 : 16); + ASSERT_TRUE(input_block.Init()); + Buffer<tran_low_t> output_ref_block = Buffer<tran_low_t>(size_, size_, 0); + ASSERT_TRUE(output_ref_block.Init()); + Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-max_pixel_value_, + // max_pixel_value_]. + input_block.Set(&rnd, -max_pixel_value_, max_pixel_value_); + + fwd_txfm_ref(input_block, &output_ref_block, size_, tx_type_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, &output_block)); + + // The minimum quant value is 4. + EXPECT_TRUE(output_block.CheckValues(output_ref_block)); + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + output_block.PrintDifference(output_ref_block); + return; + } + } + } + + void RunMemCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + Buffer<int16_t> input_extreme_block = + Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(input_extreme_block.Init()); + Buffer<tran_low_t> output_ref_block = Buffer<tran_low_t>(size_, size_, 0); + ASSERT_TRUE(output_ref_block.Init()); + Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with -max_pixel_value_ or max_pixel_value_. + if (i == 0) { + input_extreme_block.Set(max_pixel_value_); + } else if (i == 1) { + input_extreme_block.Set(-max_pixel_value_); + } else { + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + input_extreme_block + .TopLeftPixel()[h * input_extreme_block.stride() + w] = + rnd.Rand8() % 2 ? max_pixel_value_ : -max_pixel_value_; + } + } + } + + fwd_txfm_ref(input_extreme_block, &output_ref_block, size_, tx_type_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block, &output_block)); + + // The minimum quant value is 4. + EXPECT_TRUE(output_block.CheckValues(output_ref_block)); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + EXPECT_GE( + 4 * DCT_MAX_VALUE << (bit_depth_ - 8), + abs(output_block.TopLeftPixel()[h * output_block.stride() + w])) + << "Error: 4x4 FDCT has coefficient larger than " + "4*DCT_MAX_VALUE" + << " at " << w << "," << h; + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + output_block.DumpBuffer(); + return; + } + } + } + } + } + + void RunInvAccuracyCheck(int limit) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + Buffer<int16_t> in = Buffer<int16_t>(size_, size_, 4); + ASSERT_TRUE(in.Init()); + Buffer<tran_low_t> coeff = Buffer<tran_low_t>(size_, size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer<uint8_t> dst = Buffer<uint8_t>(size_, size_, 0, 16); + ASSERT_TRUE(dst.Init()); + Buffer<uint8_t> src = Buffer<uint8_t>(size_, size_, 0); + ASSERT_TRUE(src.Init()); + Buffer<uint16_t> dst16 = Buffer<uint16_t>(size_, size_, 0, 16); + ASSERT_TRUE(dst16.Init()); + Buffer<uint16_t> src16 = Buffer<uint16_t>(size_, size_, 0); + ASSERT_TRUE(src16.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-max_pixel_value_, + // max_pixel_value_]. 
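Here, as in RunAccuracyCheck() above, the input block is a residual (in = src - dst), so the inverse transform, which adds its output back onto the destination buffer, should reconstruct the source to within the given limit. A minimal standalone sketch of that round trip for the 8-bit 4x4 case; RoundTrip4x4Sketch is a hypothetical helper, not part of this patch, but the two C functions are the same ones fdct_ref() and the C instantiations below use:

    #include <assert.h>
    #include "./vpx_dsp_rtcd.h"

    // Round-trip one packed 4x4 block: fdct(src - pred), then idct-add onto a
    // copy of the prediction, and bound the per-pixel reconstruction error.
    void RoundTrip4x4Sketch(const uint8_t src[16], const uint8_t pred[16]) {
      int16_t residual[16];
      tran_low_t coeff[16];
      uint8_t recon[16];
      for (int j = 0; j < 16; ++j) {
        residual[j] = src[j] - pred[j];
        recon[j] = pred[j];
      }
      vpx_fdct4x4_c(residual, coeff, 4);      // stride of a packed 4x4 block
      vpx_idct4x4_16_add_c(coeff, recon, 4);  // adds the inverse onto recon
      for (int j = 0; j < 16; ++j) {
        const int diff = recon[j] - src[j];
        assert(diff * diff <= 1);  // limit == 1, as in RunInvAccuracyCheck(1)
      }
    }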
+ if (bit_depth_ == VPX_BITS_8) { + src.Set(&rnd, &ACMRandom::Rand8); + dst.Set(&rnd, &ACMRandom::Rand8); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + in.TopLeftPixel()[h * in.stride() + w] = + src.TopLeftPixel()[h * src.stride() + w] - + dst.TopLeftPixel()[h * dst.stride() + w]; + } + } +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src16.Set(&rnd, 0, max_pixel_value_); + dst16.Set(&rnd, 0, max_pixel_value_); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + in.TopLeftPixel()[h * in.stride() + w] = + src16.TopLeftPixel()[h * src16.stride() + w] - + dst16.TopLeftPixel()[h * dst16.stride() + w]; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + fwd_txfm_ref(in, &coeff, size_, tx_type_); + + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst.TopLeftPixel())); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16.TopLeftPixel()))); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + int diff; +#if CONFIG_VP9_HIGHBITDEPTH + if (bit_depth_ != 8) { + diff = dst16.TopLeftPixel()[h * dst16.stride() + w] - + src16.TopLeftPixel()[h * src16.stride() + w]; + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + diff = dst.TopLeftPixel()[h * dst.stride() + w] - + src.TopLeftPixel()[h * src.stride() + w]; +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + EXPECT_GE(static_cast<uint32_t>(limit), error) + << "Error: " << size_ << "x" << size_ << " IDCT has error " + << error << " at " << w << "," << h; + } + } + } + } + + FhtFuncRef fwd_txfm_ref; + vpx_bit_depth_t bit_depth_; + int tx_type_; + int max_pixel_value_; + int size_; +}; + +class TransDCT : public TransTestBase, + public ::testing::TestWithParam<DctParam> { + public: + TransDCT() { + fwd_txfm_ref = fdct_ref; + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + size_ = GET_PARAM(2); + tx_type_ = GET_PARAM(3); + bit_depth_ = GET_PARAM(4); + max_pixel_value_ = (1 << bit_depth_) - 1; + } + + protected: + void RunFwdTxfm(const Buffer<int16_t> &in, Buffer<tran_low_t> *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); + } + + void RunInvTxfm(const Buffer<tran_low_t> &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, in.stride()); + } + + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; +}; + +TEST_P(TransDCT, AccuracyCheck) { RunAccuracyCheck(1); } + +TEST_P(TransDCT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransDCT, MemCheck) { RunMemCheck(); } + +TEST_P(TransDCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, TransDCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_10, 32, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_12, 32, 0, VPX_BITS_12), + make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_10, 16, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_12, 16, 0, VPX_BITS_12), + make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_10, 8, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_12, 8, 0, VPX_BITS_12), + make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 4, 0,
VPX_BITS_12), + make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 4, 0, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + C, TransDCT, + ::testing::Values( + make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 4, 0, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +#if !CONFIG_EMULATE_HARDWARE +#if CONFIG_VP9_HIGHBITDEPTH +/* TODO:(johannkoenig) Determine why these fail AccuracyCheck + make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_12, 32, 0, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_12, 16, 0, VPX_BITS_12), +*/ +INSTANTIATE_TEST_CASE_P( + SSE2, TransDCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_10, 32, 0, + VPX_BITS_10), + make_tuple(&vpx_fdct32x32_sse2, &vpx_idct32x32_1024_add_sse2, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_10, 16, 0, + VPX_BITS_10), + make_tuple(&vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_sse2, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_10, 8, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_12, 8, 0, VPX_BITS_12), + make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 8, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12, 4, 0, VPX_BITS_12), + make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_sse2, 4, 0, + VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + SSE2, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_sse2, + &vpx_idct32x32_1024_add_sse2, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_sse2, + &vpx_idct16x16_256_add_sse2, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 8, + 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_sse2, 4, + 0, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // !CONFIG_EMULATE_HARDWARE +#endif // HAVE_SSE2 + +#if !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE +#if !ARCH_X86_64 +// TODO(johannkoenig): high bit depth fdct8x8. +INSTANTIATE_TEST_CASE_P( + SSSE3, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2, + 32, 0, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_sse2, 8, 0, + VPX_BITS_8))); +#else +// vpx_fdct8x8_ssse3 is only available in 64 bit builds. +INSTANTIATE_TEST_CASE_P( + SSSE3, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2, + 32, 0, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_ssse3, &vpx_idct8x8_64_add_sse2, + 8, 0, VPX_BITS_8))); +#endif // !ARCH_X86_64 +#endif // HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE +#endif // !CONFIG_VP9_HIGHBITDEPTH + +#if !CONFIG_VP9_HIGHBITDEPTH && HAVE_AVX2 && !CONFIG_EMULATE_HARDWARE +// TODO(johannkoenig): high bit depth fdct32x32. 
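Each entry in these lists is one DctParam tuple; reading it against the typedef near the top of the file gives the positional meaning. An annotated copy of the AVX2 entry that follows (commentary only, not new test code); note the AVX2 forward transform is checked against the SSE2 inverse, since no AVX2 inverse is wired up here:

    // make_tuple(&vpx_fdct32x32_avx2,           <- FdctFunc under test
    //            &vpx_idct32x32_1024_add_sse2,  <- IdctFunc used to reconstruct
    //            32,                            <- transform size (32x32)
    //            0,                             <- tx_type (ignored by fdct_ref)
    //            VPX_BITS_8)                    <- bit depth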
+INSTANTIATE_TEST_CASE_P( + AVX2, TransDCT, ::testing::Values(make_tuple(&vpx_fdct32x32_avx2, + &vpx_idct32x32_1024_add_sse2, + 32, 0, VPX_BITS_8))); + +#endif // !CONFIG_VP9_HIGHBITDEPTH && HAVE_AVX2 && !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON +#if !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P( + NEON, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_neon, + &vpx_idct32x32_1024_add_neon, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_neon, + &vpx_idct16x16_256_add_neon, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 8, + 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_neon, &vpx_idct4x4_16_add_neon, 4, + 0, VPX_BITS_8))); +#endif // !CONFIG_EMULATE_HARDWARE +#endif // HAVE_NEON + +#if HAVE_MSA +#if !CONFIG_VP9_HIGHBITDEPTH +#if !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P( + MSA, TransDCT, + ::testing::Values( + make_tuple(&vpx_fdct32x32_msa, &vpx_idct32x32_1024_add_msa, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_msa, &vpx_idct16x16_256_add_msa, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_msa, &vpx_idct8x8_64_add_msa, 8, 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_msa, &vpx_idct4x4_16_add_msa, 4, 0, + VPX_BITS_8))); +#endif // !CONFIG_EMULATE_HARDWARE +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(VSX, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct4x4_c, + &vpx_idct4x4_16_add_vsx, 4, + 0, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +class TransHT : public TransTestBase, public ::testing::TestWithParam<HtParam> { + public: + TransHT() { + fwd_txfm_ref = fht_ref; + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + size_ = GET_PARAM(2); + tx_type_ = GET_PARAM(3); + bit_depth_ = GET_PARAM(4); + max_pixel_value_ = (1 << bit_depth_) - 1; + } + + protected: + void RunFwdTxfm(const Buffer<int16_t> &in, Buffer<tran_low_t> *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride(), tx_type_); + } + + void RunInvTxfm(const Buffer<tran_low_t> &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, in.stride(), tx_type_); + } + + FhtFunc fwd_txfm_; + IhtFunc inv_txfm_; +}; + +TEST_P(TransHT, AccuracyCheck) { RunAccuracyCheck(1); } + +TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransHT, MemCheck) { RunMemCheck(); } + +TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } + +/* TODO:(johannkoenig) Determine why these fail AccuracyCheck + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 0, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 1, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 2, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 3, VPX_BITS_12), + */ +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, TransHT, + ::testing::Values( + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 1, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 2, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 3, VPX_BITS_10), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 0, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 1, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 2, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 3, VPX_BITS_8), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 0, 
VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 1, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 2, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 3, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 0, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 1, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 2, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 3, VPX_BITS_12), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 3, VPX_BITS_8), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 1, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 2, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 3, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 0, VPX_BITS_12), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 1, VPX_BITS_12), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 2, VPX_BITS_12), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 3, VPX_BITS_12), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 0, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 1, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 2, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 3, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + C, TransHT, + ::testing::Values( + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 0, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 1, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 2, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 3, VPX_BITS_8), + + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 3, VPX_BITS_8), + + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 0, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 1, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 2, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 3, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, TransHT, + ::testing::Values( + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 0, + VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 1, + VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 2, + VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 3, + VPX_BITS_8), + + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 3, VPX_BITS_8), + + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 0, VPX_BITS_8), + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 1, VPX_BITS_8), + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 2, VPX_BITS_8), + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 3, + VPX_BITS_8))); 
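Throughout the TransHT instantiations above, the fourth tuple field is the tx_type forwarded to the vp9_fht/vp9_iht functions; it selects which direction uses a DCT and which an ADST, which is why every forward/inverse pairing appears four times. The values follow VP9's TX_TYPE enum (as defined in vp9/common/vp9_enums.h):

    typedef enum {
      DCT_DCT = 0,    // DCT in both horizontal and vertical
      ADST_DCT = 1,   // ADST in vertical, DCT in horizontal
      DCT_ADST = 2,   // DCT in vertical, ADST in horizontal
      ADST_ADST = 3   // ADST in both directions
    } TX_TYPE;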
+#endif // HAVE_SSE2 + +class TransWHT : public TransTestBase, + public ::testing::TestWithParam<DctParam> { + public: + TransWHT() { + fwd_txfm_ref = fwht_ref; + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + size_ = GET_PARAM(2); + tx_type_ = GET_PARAM(3); + bit_depth_ = GET_PARAM(4); + max_pixel_value_ = (1 << bit_depth_) - 1; + } + + protected: + void RunFwdTxfm(const Buffer<int16_t> &in, Buffer<tran_low_t> *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); + } + + void RunInvTxfm(const Buffer<tran_low_t> &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, in.stride()); + } + + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; +}; + +TEST_P(TransWHT, AccuracyCheck) { RunAccuracyCheck(0); } + +TEST_P(TransWHT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransWHT, MemCheck) { RunMemCheck(); } + +TEST_P(TransWHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, TransWHT, + ::testing::Values( + make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 4, 0, VPX_BITS_12), + make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 4, 0, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P(C, TransWHT, + ::testing::Values(make_tuple(&vp9_fwht4x4_c, + &vpx_iwht4x4_16_add_c, 4, + 0, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P(SSE2, TransWHT, + ::testing::Values(make_tuple(&vp9_fwht4x4_sse2, + &vpx_iwht4x4_16_add_sse2, + 4, 0, VPX_BITS_8))); +#endif // HAVE_SSE2 +} // namespace diff --git a/libvpx/test/decode_test_driver.cc b/libvpx/test/decode_test_driver.cc index b738e0db1..48680eb8e 100644 --- a/libvpx/test/decode_test_driver.cc +++ b/libvpx/test/decode_test_driver.cc @@ -53,13 +53,13 @@ void DecoderTest::HandlePeekResult(Decoder *const decoder, * pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first * frame, which must be a keyframe. */ if (video->frame_number() == 0) - ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: " - << vpx_codec_err_to_string(res_peek); + ASSERT_EQ(VPX_CODEC_OK, res_peek) + << "Peek return failed: " << vpx_codec_err_to_string(res_peek); } else { /* The Vp9 implementation of PeekStream returns an error only if the * data passed to it isn't a valid Vp9 chunk. 
*/ - ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: " - << vpx_codec_err_to_string(res_peek); + ASSERT_EQ(VPX_CODEC_OK, res_peek) + << "Peek return failed: " << vpx_codec_err_to_string(res_peek); } } diff --git a/libvpx/test/encode_api_test.cc b/libvpx/test/encode_api_test.cc index f685493aa..87e29b61d 100644 --- a/libvpx/test/encode_api_test.cc +++ b/libvpx/test/encode_api_test.cc @@ -79,4 +79,117 @@ TEST(EncodeAPI, HighBitDepthCapability) { #endif } +#if CONFIG_VP8_ENCODER +TEST(EncodeAPI, ImageSizeSetting) { + const int width = 711; + const int height = 360; + const int bps = 12; + vpx_image_t img; + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + uint8_t *img_buf = reinterpret_cast<uint8_t *>( + calloc(width * height * bps / 8, sizeof(*img_buf))); + vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0); + + cfg.g_w = width; + cfg.g_h = height; + + vpx_img_wrap(&img, VPX_IMG_FMT_I420, width, height, 1, img_buf); + + vpx_codec_enc_init(&enc, vpx_codec_vp8_cx(), &cfg, 0); + + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, &img, 0, 1, 0, 0)); + + free(img_buf); + + vpx_codec_destroy(&enc); +} +#endif + +// Set up 2 spatial streams with 2 temporal layers per stream, and generate +// invalid configuration by setting the temporal layer rate allocation +// (ts_target_bitrate[]) to 0 for both layers. This should fail independent of +// CONFIG_MULTI_RES_ENCODING. +TEST(EncodeAPI, MultiResEncode) { + static const vpx_codec_iface_t *kCodecs[] = { +#if CONFIG_VP8_ENCODER + &vpx_codec_vp8_cx_algo, +#endif +#if CONFIG_VP9_ENCODER + &vpx_codec_vp9_cx_algo, +#endif + }; + const int width = 1280; + const int height = 720; + const int width_down = width / 2; + const int height_down = height / 2; + const int target_bitrate = 1000; + const int framerate = 30; + + for (int c = 0; c < NELEMENTS(kCodecs); ++c) { + const vpx_codec_iface_t *const iface = kCodecs[c]; + vpx_codec_ctx_t enc[2]; + vpx_codec_enc_cfg_t cfg[2]; + vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } }; + + memset(enc, 0, sizeof(enc)); + + for (int i = 0; i < 2; i++) { + vpx_codec_enc_config_default(iface, &cfg[i], 0); + } + + /* Highest-resolution encoder settings */ + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].rc_dropframe_thresh = 0; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 2; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 100; + cfg[0].rc_overshoot_pct = 15; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ + cfg[0].g_lag_in_frames = 0; + + cfg[0].kf_mode = VPX_KF_AUTO; + cfg[0].kf_min_dist = 3000; + cfg[0].kf_max_dist = 3000; + + cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */ + cfg[0].g_timebase.num = 1; /* Set fps */ + cfg[0].g_timebase.den = framerate; + + memcpy(&cfg[1], &cfg[0], sizeof(cfg[0])); + cfg[1].rc_target_bitrate = 500; + cfg[1].g_w = width_down; + cfg[1].g_h = height_down; + + for (int i = 0; i < 2; i++) { + cfg[i].ts_number_layers = 2; + cfg[i].ts_periodicity = 2; + cfg[i].ts_rate_decimator[0] = 2; + cfg[i].ts_rate_decimator[1] = 1; + cfg[i].ts_layer_id[0] = 0; + cfg[i].ts_layer_id[1] = 1; + // Invalid parameters. + cfg[i].ts_target_bitrate[0] = 0; + cfg[i].ts_target_bitrate[1] = 0; + } + + // VP9 should report incapable, VP8 invalid for all configurations. 
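The expectation below distinguishes the two encoders by their interface-name prefix rather than by compile-time flags, so one loop body covers whichever codecs were built in. The same check pulled out as a sketch; IsVP9 is a hypothetical helper, not part of this patch:

    #include <string.h>
    #include "vpx/vpx_codec.h"

    // True when the interface reports itself as VP9; vpx_codec_iface_name()
    // returns a name such as "WebM Project VP9 Encoder".
    bool IsVP9(vpx_codec_iface_t *iface) {
      const char kVP9Name[] = "WebM Project VP9";
      return strncmp(kVP9Name, vpx_codec_iface_name(iface),
                     sizeof(kVP9Name) - 1) == 0;
    }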
+ const char kVP9Name[] = "WebM Project VP9"; + const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface), + sizeof(kVP9Name) - 1) == 0; + EXPECT_EQ(is_vp9 ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0])); + + for (int i = 0; i < 2; i++) { + vpx_codec_destroy(&enc[i]); + } + } +} + } // namespace diff --git a/libvpx/test/encode_test_driver.cc b/libvpx/test/encode_test_driver.cc index 5d2b4008a..b2cbc3f05 100644 --- a/libvpx/test/encode_test_driver.cc +++ b/libvpx/test/encode_test_driver.cc @@ -201,6 +201,8 @@ void EncoderTest::RunLoop(VideoSource *video) { PreEncodeFrameHook(video, encoder.get()); encoder->EncodeFrame(video, frame_flags_); + PostEncodeFrameHook(encoder.get()); + CxDataIterator iter = encoder->GetCxData(); bool has_cxdata = false; diff --git a/libvpx/test/encode_test_driver.h b/libvpx/test/encode_test_driver.h index 08a57ad77..89a3b1767 100644 --- a/libvpx/test/encode_test_driver.h +++ b/libvpx/test/encode_test_driver.h @@ -139,6 +139,13 @@ class Encoder { } #endif +#if CONFIG_VP8_ENCODER + void Control(int ctrl_id, vpx_roi_map_t *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } +#endif + void Config(const vpx_codec_enc_cfg_t *cfg) { const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); @@ -212,6 +219,8 @@ class EncoderTest { virtual void PreEncodeFrameHook(VideoSource * /*video*/, Encoder * /*encoder*/) {} + virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {} + // Hook to be called on every compressed data packet. virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} diff --git a/libvpx/test/external_frame_buffer_test.cc b/libvpx/test/external_frame_buffer_test.cc index f9686695a..dbf297119 100644 --- a/libvpx/test/external_frame_buffer_test.cc +++ b/libvpx/test/external_frame_buffer_test.cc @@ -34,7 +34,8 @@ struct ExternalFrameBuffer { // Class to manipulate a list of external frame buffers. class ExternalFrameBufferList { public: - ExternalFrameBufferList() : num_buffers_(0), ext_fb_list_(NULL) {} + ExternalFrameBufferList() + : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(NULL) {} virtual ~ExternalFrameBufferList() { for (int i = 0; i < num_buffers_; ++i) { @@ -71,6 +72,8 @@ class ExternalFrameBufferList { } SetFrameBuffer(idx, fb); + + num_used_buffers_++; return 0; } @@ -106,6 +109,7 @@ class ExternalFrameBufferList { } EXPECT_EQ(1, ext_fb->in_use); ext_fb->in_use = 0; + num_used_buffers_--; return 0; } @@ -121,6 +125,8 @@ class ExternalFrameBufferList { } } + int num_used_buffers() const { return num_used_buffers_; } + private: // Returns the index of the first free frame buffer. Returns |num_buffers_| // if there are no free frame buffers. @@ -145,6 +151,7 @@ class ExternalFrameBufferList { } int num_buffers_; + int num_used_buffers_; ExternalFrameBuffer *ext_fb_list_; }; @@ -220,8 +227,8 @@ class ExternalFrameBufferMD5Test void OpenMD5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: " - << md5_file_name_; + ASSERT_TRUE(md5_file_ != NULL) + << "Md5 file open failed. 
Filename: " << md5_file_name_; } virtual void DecompressedFrameHook(const vpx_image_t &img, @@ -273,6 +280,7 @@ class ExternalFrameBufferMD5Test #if CONFIG_WEBM_IO const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm"; +const char kVP9NonRefTestFile[] = "vp90-2-22-svc_1280x720_1.webm"; // Class for testing passing in external frame buffers to libvpx. class ExternalFrameBufferTest : public ::testing::Test { @@ -292,7 +300,9 @@ class ExternalFrameBufferTest : public ::testing::Test { virtual void TearDown() { delete decoder_; + decoder_ = NULL; delete video_; + video_ = NULL; } // Passes the external frame buffer information to libvpx. @@ -325,7 +335,7 @@ class ExternalFrameBufferTest : public ::testing::Test { return VPX_CODEC_OK; } - private: + protected: void CheckDecodedFrames() { libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData(); const vpx_image_t *img = NULL; @@ -341,6 +351,25 @@ class ExternalFrameBufferTest : public ::testing::Test { int num_buffers_; ExternalFrameBufferList fb_list_; }; + +class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest { + protected: + virtual void SetUp() { + video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile); + ASSERT_TRUE(video_ != NULL); + video_->Init(); + video_->Begin(); + + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + decoder_ = new libvpx_test::VP9Decoder(cfg, 0); + ASSERT_TRUE(decoder_ != NULL); + } + + virtual void CheckFrameBufferRelease() { + TearDown(); + ASSERT_EQ(0, fb_list_.num_used_buffers()); + } +}; #endif // CONFIG_WEBM_IO // This test runs through the set of test vectors, and decodes them. @@ -419,6 +448,8 @@ TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) { SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer)); ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame()); + // Only run this on long clips. Decoding a very short clip will return + // VPX_CODEC_OK even with only 2 buffers. ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames()); } @@ -467,6 +498,15 @@ TEST_F(ExternalFrameBufferTest, SetAfterDecode) { SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer)); } + +TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(VPX_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, + release_vp9_frame_buffer)); + ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames()); + CheckFrameBufferRelease(); +} #endif // CONFIG_WEBM_IO VP9_INSTANTIATE_TEST_CASE( diff --git a/libvpx/test/fdct4x4_test.cc b/libvpx/test/fdct4x4_test.cc deleted file mode 100644 index aa90bfa18..000000000 --- a/libvpx/test/fdct4x4_test.cc +++ /dev/null @@ -1,511 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <math.h> -#include <stdlib.h> -#include <string.h> - -#include "third_party/googletest/src/include/gtest/gtest.h" - -#include "./vp9_rtcd.h" -#include "./vpx_dsp_rtcd.h" -#include "test/acm_random.h" -#include "test/clear_system_state.h" -#include "test/register_state_check.h" -#include "test/util.h" -#include "vp9/common/vp9_entropy.h" -#include "vpx/vpx_codec.h" -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -using libvpx_test::ACMRandom; - -namespace { -const int kNumCoeffs = 16; -typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); -typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, - int tx_type); -typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, - int tx_type); - -typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct4x4Param; -typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param; - -void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride, - int /*tx_type*/) { - vpx_fdct4x4_c(in, out, stride); -} - -void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { - vp9_fht4x4_c(in, out, stride, tx_type); -} - -void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride, - int /*tx_type*/) { - vp9_fwht4x4_c(in, out, stride); -} - -#if CONFIG_VP9_HIGHBITDEPTH -void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); -} - -void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); -} - -void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); -} - -void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); -} - -void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); -} - -void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); -} - -#if HAVE_SSE2 -void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); -} - -void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); -} -#endif // HAVE_SSE2 -#endif // CONFIG_VP9_HIGHBITDEPTH - -class Trans4x4TestBase { - public: - virtual ~Trans4x4TestBase() {} - - protected: - virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0; - - virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0; - - void RunAccuracyCheck(int limit) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - uint32_t max_error = 0; - int64_t total_error = 0; - const int count_test_block = 10000; - for (int i = 0; i < count_test_block; ++i) { - DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); -#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); - DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); -#endif - - // Initialize a test block with 
input range [-255, 255]. - for (int j = 0; j < kNumCoeffs; ++j) { - if (bit_depth_ == VPX_BITS_8) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); - test_input_block[j] = src[j] - dst[j]; -#if CONFIG_VP9_HIGHBITDEPTH - } else { - src16[j] = rnd.Rand16() & mask_; - dst16[j] = rnd.Rand16() & mask_; - test_input_block[j] = src16[j] - dst16[j]; -#endif - } - } - - ASM_REGISTER_STATE_CHECK( - RunFwdTxfm(test_input_block, test_temp_block, pitch_)); - if (bit_depth_ == VPX_BITS_8) { - ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_)); -#if CONFIG_VP9_HIGHBITDEPTH - } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); -#endif - } - - for (int j = 0; j < kNumCoeffs; ++j) { -#if CONFIG_VP9_HIGHBITDEPTH - const int diff = - bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; -#else - ASSERT_EQ(VPX_BITS_8, bit_depth_); - const int diff = dst[j] - src[j]; -#endif - const uint32_t error = diff * diff; - if (max_error < error) max_error = error; - total_error += error; - } - } - - EXPECT_GE(static_cast<uint32_t>(limit), max_error) - << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit; - - EXPECT_GE(count_test_block * limit, total_error) - << "Error: 4x4 FHT/IHT has average round trip error > " << limit - << " per block"; - } - - void RunCoeffCheck() { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 5000; - DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); - } - - fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_); - ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_)); - - // The minimum quant value is 4. - for (int j = 0; j < kNumCoeffs; ++j) - EXPECT_EQ(output_block[j], output_ref_block[j]); - } - } - - void RunMemCheck() { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 5000; - DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_; - } - if (i == 0) { - for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_; - } else if (i == 1) { - for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_; - } - - fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_); - ASM_REGISTER_STATE_CHECK( - RunFwdTxfm(input_extreme_block, output_block, pitch_)); - - // The minimum quant value is 4. 
- for (int j = 0; j < kNumCoeffs; ++j) { - EXPECT_EQ(output_block[j], output_ref_block[j]); - EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j])) - << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE"; - } - } - } - - void RunInvAccuracyCheck(int limit) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 1000; - DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); -#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); - DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); -#endif - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - if (bit_depth_ == VPX_BITS_8) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); - in[j] = src[j] - dst[j]; -#if CONFIG_VP9_HIGHBITDEPTH - } else { - src16[j] = rnd.Rand16() & mask_; - dst16[j] = rnd.Rand16() & mask_; - in[j] = src16[j] - dst16[j]; -#endif - } - } - - fwd_txfm_ref(in, coeff, pitch_, tx_type_); - - if (bit_depth_ == VPX_BITS_8) { - ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); -#if CONFIG_VP9_HIGHBITDEPTH - } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); -#endif - } - - for (int j = 0; j < kNumCoeffs; ++j) { -#if CONFIG_VP9_HIGHBITDEPTH - const int diff = - bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; -#else - const int diff = dst[j] - src[j]; -#endif - const uint32_t error = diff * diff; - EXPECT_GE(static_cast<uint32_t>(limit), error) - << "Error: 4x4 IDCT has error " << error << " at index " << j; - } - } - } - - int pitch_; - int tx_type_; - FhtFunc fwd_txfm_ref; - vpx_bit_depth_t bit_depth_; - int mask_; -}; - -class Trans4x4DCT : public Trans4x4TestBase, - public ::testing::TestWithParam<Dct4x4Param> { - public: - virtual ~Trans4x4DCT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fdct4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride); - } - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride); - } - - FdctFunc fwd_txfm_; - IdctFunc inv_txfm_; -}; - -TEST_P(Trans4x4DCT, AccuracyCheck) { RunAccuracyCheck(1); } - -TEST_P(Trans4x4DCT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4DCT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4DCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } - -class Trans4x4HT : public Trans4x4TestBase, - public ::testing::TestWithParam<Ht4x4Param> { - public: - virtual ~Trans4x4HT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fht4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride, tx_type_); - } - - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride, tx_type_); - } - - FhtFunc fwd_txfm_; - IhtFunc inv_txfm_; -}; - -TEST_P(Trans4x4HT, 
AccuracyCheck) { RunAccuracyCheck(1); } - -TEST_P(Trans4x4HT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4HT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } - -class Trans4x4WHT : public Trans4x4TestBase, - public ::testing::TestWithParam<Dct4x4Param> { - public: - virtual ~Trans4x4WHT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fwht4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride); - } - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride); - } - - FdctFunc fwd_txfm_; - IdctFunc inv_txfm_; -}; - -TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0); } - -TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } -using std::tr1::make_tuple; - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4DCT, - ::testing::Values( - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P(C, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_c, - &vpx_idct4x4_16_add_c, 0, - VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P( - C, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4WHT, - ::testing::Values( - make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12), - make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P(C, Trans4x4WHT, - ::testing::Values(make_tuple(&vp9_fwht4x4_c, - &vpx_iwht4x4_16_add_c, 0, - VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_neon, - &vpx_idct4x4_16_add_neon, - 0, VPX_BITS_8))); -#if 
!CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - NEON, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8))); -#endif // !CONFIG_VP9_HIGHBITDEPTH -#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE - -#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4WHT, - ::testing::Values( - make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8))); -#endif - -#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(SSE2, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_sse2, - &vpx_idct4x4_16_add_sse2, - 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8))); -#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4DCT, - ::testing::Values( - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, VPX_BITS_12), - make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12), - make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8))); - -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(MSA, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_msa, - &vpx_idct4x4_16_add_msa, 0, - VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - MSA, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8))); -#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -} // namespace diff --git a/libvpx/test/fdct8x8_test.cc b/libvpx/test/fdct8x8_test.cc index dfbb5dc3d..5021dda9b 100644 --- a/libvpx/test/fdct8x8_test.cc +++ b/libvpx/test/fdct8x8_test.cc @@ -511,8 +511,8 @@ class FwdTrans8x8TestBase { const int diff = dst[j] - ref[j]; #endif const uint32_t error = diff * diff; - EXPECT_EQ(0u, error) << "Error: 8x8 IDCT has error " << error - << " at index " << j; + EXPECT_EQ(0u, error) + << "Error: 8x8 IDCT has error " << error << " at index " << j; } } } @@ -739,7 +739,7 @@ INSTANTIATE_TEST_CASE_P( !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT, 
::testing::Values(make_tuple(&vpx_fdct8x8_ssse3, - &vpx_idct8x8_64_add_ssse3, + &vpx_idct8x8_64_add_sse2, 0, VPX_BITS_8))); #endif @@ -756,4 +756,11 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 2, VPX_BITS_8), make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 3, VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(VSX, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_c, + &vpx_idct8x8_64_add_vsx, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libvpx/test/hadamard_test.cc b/libvpx/test/hadamard_test.cc index a55b15ad0..3b7cfeddc 100644 --- a/libvpx/test/hadamard_test.cc +++ b/libvpx/test/hadamard_test.cc @@ -22,7 +22,8 @@ namespace { using ::libvpx_test::ACMRandom; -typedef void (*HadamardFunc)(const int16_t *a, int a_stride, tran_low_t *b); +typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride, + tran_low_t *b); void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) { int16_t b[8]; @@ -268,6 +269,11 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_sse2)); #endif // HAVE_SSE2 +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P(AVX2, Hadamard16x16Test, + ::testing::Values(&vpx_hadamard_16x16_avx2)); +#endif // HAVE_AVX2 + #if HAVE_VSX INSTANTIATE_TEST_CASE_P(VSX, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_vsx)); diff --git a/libvpx/test/idct_test.cc b/libvpx/test/idct_test.cc index 084b2ed0c..3700374d7 100644 --- a/libvpx/test/idct_test.cc +++ b/libvpx/test/idct_test.cc @@ -30,12 +30,15 @@ class IDCTTest : public ::testing::TestWithParam<IdctFunc> { virtual void SetUp() { UUT = GetParam(); - input = new (std::nothrow) Buffer<int16_t>(4, 4, 0); + input = new Buffer<int16_t>(4, 4, 0); ASSERT_TRUE(input != NULL); - predict = new (std::nothrow) Buffer<uint8_t>(4, 4, 3); + ASSERT_TRUE(input->Init()); + predict = new Buffer<uint8_t>(4, 4, 3); ASSERT_TRUE(predict != NULL); - output = new (std::nothrow) Buffer<uint8_t>(4, 4, 3); + ASSERT_TRUE(predict->Init()); + output = new Buffer<uint8_t>(4, 4, 3); ASSERT_TRUE(output != NULL); + ASSERT_TRUE(output->Init()); } virtual void TearDown() { @@ -166,4 +169,9 @@ INSTANTIATE_TEST_CASE_P(MMX, IDCTTest, INSTANTIATE_TEST_CASE_P(MSA, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_msa)); #endif // HAVE_MSA + +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P(MMI, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_mmi)); +#endif // HAVE_MMI } diff --git a/libvpx/test/invalid_file_test.cc b/libvpx/test/invalid_file_test.cc index eae81faa1..79220b0f6 100644 --- a/libvpx/test/invalid_file_test.cc +++ b/libvpx/test/invalid_file_test.cc @@ -45,8 +45,8 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, void OpenResFile(const std::string &res_file_name_) { res_file_ = libvpx_test::OpenTestDataFile(res_file_name_); - ASSERT_TRUE(res_file_ != NULL) << "Result file open failed. Filename: " - << res_file_name_; + ASSERT_TRUE(res_file_ != NULL) + << "Result file open failed. 
Filename: " << res_file_name_; } virtual bool HandleDecodeResult( @@ -120,11 +120,23 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, TEST_P(InvalidFileTest, ReturnCode) { RunTest(); } +#if CONFIG_VP8_DECODER +const DecodeParam kVP8InvalidFileTests[] = { + { 1, "invalid-bug-1443.ivf" }, +}; + +VP8_INSTANTIATE_TEST_CASE(InvalidFileTest, + ::testing::ValuesIn(kVP8InvalidFileTests)); +#endif // CONFIG_VP8_DECODER + #if CONFIG_VP9_DECODER const DecodeParam kVP9InvalidFileTests[] = { { 1, "invalid-vp90-02-v2.webm" }, #if CONFIG_VP9_HIGHBITDEPTH { 1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf" }, + { 1, + "invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-." + "ivf" }, #endif { 1, "invalid-vp90-03-v3.webm" }, { 1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf" }, @@ -164,12 +176,12 @@ class InvalidFileInvalidPeekTest : public InvalidFileTest { TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { RunTest(); } #if CONFIG_VP8_DECODER -const DecodeParam kVP8InvalidFileTests[] = { +const DecodeParam kVP8InvalidPeekTests[] = { { 1, "invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf" }, }; VP8_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest, - ::testing::ValuesIn(kVP8InvalidFileTests)); + ::testing::ValuesIn(kVP8InvalidPeekTests)); #endif // CONFIG_VP8_DECODER #if CONFIG_VP9_DECODER diff --git a/libvpx/test/ivf_video_source.h b/libvpx/test/ivf_video_source.h index b87624a11..5862d2649 100644 --- a/libvpx/test/ivf_video_source.h +++ b/libvpx/test/ivf_video_source.h @@ -47,8 +47,8 @@ class IVFVideoSource : public CompressedVideoSource { virtual void Begin() { input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(input_file_ != NULL) + << "Input file open failed. 
Filename: " << file_name_; // Read file header uint8_t file_hdr[kIvfFileHdrSize]; diff --git a/libvpx/test/keyframe_test.cc b/libvpx/test/keyframe_test.cc index 38bd923b7..ee75f401c 100644 --- a/libvpx/test/keyframe_test.cc +++ b/libvpx/test/keyframe_test.cc @@ -135,8 +135,8 @@ TEST_P(KeyframeTest, TestAutoKeyframe) { for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin(); iter != kf_pts_list_.end(); ++iter) { if (deadline_ == VPX_DL_REALTIME && *iter > 0) - EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame " - << *iter; + EXPECT_EQ(0, (*iter - 1) % 30) + << "Unexpected keyframe at frame " << *iter; else EXPECT_EQ(0, *iter % 30) << "Unexpected keyframe at frame " << *iter; } diff --git a/libvpx/test/level_test.cc b/libvpx/test/level_test.cc index 85097e94b..26935a81b 100644 --- a/libvpx/test/level_test.cc +++ b/libvpx/test/level_test.cc @@ -73,7 +73,7 @@ TEST_P(LevelTest, TestTargetLevel11Large) { target_level_ = 11; cfg_.rc_target_bitrate = 150; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(target_level_, level_); + ASSERT_GE(target_level_, level_); } TEST_P(LevelTest, TestTargetLevel20Large) { @@ -83,7 +83,7 @@ TEST_P(LevelTest, TestTargetLevel20Large) { target_level_ = 20; cfg_.rc_target_bitrate = 1200; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(target_level_, level_); + ASSERT_GE(target_level_, level_); } TEST_P(LevelTest, TestTargetLevel31Large) { @@ -93,7 +93,7 @@ TEST_P(LevelTest, TestTargetLevel31Large) { target_level_ = 31; cfg_.rc_target_bitrate = 8000; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(target_level_, level_); + ASSERT_GE(target_level_, level_); } // Test for keeping level stats only @@ -103,11 +103,11 @@ TEST_P(LevelTest, TestTargetLevel0) { target_level_ = 0; min_gf_internal_ = 4; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(11, level_); + ASSERT_GE(11, level_); cfg_.rc_target_bitrate = 1600; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(20, level_); + ASSERT_GE(20, level_); } // Test for level control being turned off @@ -130,7 +130,7 @@ TEST_P(LevelTest, TestTargetLevelApi) { if (level == 10 || level == 11 || level == 20 || level == 21 || level == 30 || level == 31 || level == 40 || level == 41 || level == 50 || level == 51 || level == 52 || level == 60 || - level == 61 || level == 62 || level == 0 || level == 255) + level == 61 || level == 62 || level == 0 || level == 1 || level == 255) EXPECT_EQ(VPX_CODEC_OK, vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level)); else diff --git a/libvpx/test/lpf_test.cc b/libvpx/test/lpf_test.cc index 4fca7d49c..e04b996cd 100644 --- a/libvpx/test/lpf_test.cc +++ b/libvpx/test/lpf_test.cc @@ -114,6 +114,18 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit, } } +uint8_t GetOuterThresh(ACMRandom *rnd) { + return static_cast<uint8_t>(rnd->RandRange(3 * MAX_LOOP_FILTER + 5)); +} + +uint8_t GetInnerThresh(ACMRandom *rnd) { + return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1)); +} + +uint8_t GetHevThresh(ACMRandom *rnd) { + return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1) >> 4); +} + class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> { public: virtual ~Loop8Test6Param() {} @@ -162,15 +174,15 @@ TEST_P(Loop8Test6Param, OperationCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = { 
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -221,15 +233,15 @@ TEST_P(Loop8Test6Param, ValueCheck) { for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -271,27 +283,27 @@ TEST_P(Loop8Test9Param, OperationCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4)); + tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -334,27 +346,27 @@ TEST_P(Loop8Test9Param, ValueCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4)); + tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const 
uint8_t, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; diff --git a/libvpx/test/minmax_test.cc b/libvpx/test/minmax_test.cc index e5c93ed7d..9c119116a 100644 --- a/libvpx/test/minmax_test.cc +++ b/libvpx/test/minmax_test.cc @@ -107,10 +107,10 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) { int min_ref, max_ref, min, max; reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref); ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max)); - EXPECT_EQ(max_ref, max) << "when a_stride = " << a_stride - << " and b_stride = " << b_stride; - EXPECT_EQ(min_ref, min) << "when a_stride = " << a_stride - << " and b_stride = " << b_stride; + EXPECT_EQ(max_ref, max) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + EXPECT_EQ(min_ref, min) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; } } } diff --git a/libvpx/test/partial_idct_test.cc b/libvpx/test/partial_idct_test.cc index 740d7e202..f7b50f53a 100644 --- a/libvpx/test/partial_idct_test.cc +++ b/libvpx/test/partial_idct_test.cc @@ -62,9 +62,9 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { virtual ~PartialIDctTest() {} virtual void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); - ftxfm_ = GET_PARAM(0); - full_itxfm_ = GET_PARAM(1); - partial_itxfm_ = GET_PARAM(2); + fwd_txfm_ = GET_PARAM(0); + full_inv_txfm_ = GET_PARAM(1); + partial_inv_txfm_ = GET_PARAM(2); tx_size_ = GET_PARAM(3); last_nonzero_ = GET_PARAM(4); bit_depth_ = GET_PARAM(5); @@ -128,12 +128,12 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { } void InitInput() { - const int max_coeff = (32766 << (bit_depth_ - 8)) / 4; - int max_energy_leftover = max_coeff * max_coeff; + const int64_t max_coeff = (32766 << (bit_depth_ - 8)) / 4; + int64_t max_energy_leftover = max_coeff * max_coeff; for (int j = 0; j < last_nonzero_; ++j) { tran_low_t coeff = static_cast<tran_low_t>( sqrt(1.0 * max_energy_leftover) * (rnd_.Rand16() - 32768) / 65536); - max_energy_leftover -= coeff * coeff; + max_energy_leftover -= static_cast<int64_t>(coeff) * coeff; if (max_energy_leftover < 0) { max_energy_leftover = 0; coeff = 0; @@ -161,6 +161,14 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { } } } + + printf("\ninput_block_:\n"); + for (int y = 0; y < size_; y++) { + for (int x = 0; x < size_; x++) { + printf("%6d,", input_block_[y * size_ + x]); + } + printf("\n"); + } } } @@ -177,9 +185,9 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { int output_block_size_; int bit_depth_; int mask_; - FwdTxfmFunc ftxfm_; - InvTxfmWithBdFunc full_itxfm_; - InvTxfmWithBdFunc partial_itxfm_; + FwdTxfmFunc fwd_txfm_; + InvTxfmWithBdFunc full_inv_txfm_; + InvTxfmWithBdFunc partial_inv_txfm_; ACMRandom rnd_; }; @@ -213,7 +221,7 @@ TEST_P(PartialIDctTest, RunQuantCheck) { } } - ftxfm_(input_extreme_block, output_ref_block, size_); + fwd_txfm_(input_extreme_block, output_ref_block, size_); // quantization with minimum allowed step sizes input_block_[0] = (output_ref_block[0] / 4) 
* 4; @@ -223,9 +231,9 @@ TEST_P(PartialIDctTest, RunQuantCheck) { } ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); ASM_REGISTER_STATE_CHECK( - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_)); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, pixel_size_ * output_block_size_)) << "Error: partial inverse transform produces different results"; @@ -238,9 +246,9 @@ TEST_P(PartialIDctTest, ResultsMatch) { InitInput(); ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); ASM_REGISTER_STATE_CHECK( - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_)); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, pixel_size_ * output_block_size_)) << "Error: partial inverse transform produces different results"; @@ -255,9 +263,9 @@ TEST_P(PartialIDctTest, AddOutputBlock) { } ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); ASM_REGISTER_STATE_CHECK( - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_)); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, pixel_size_ * output_block_size_)) << "Error: Transform results are not correctly added to output."; @@ -278,9 +286,9 @@ TEST_P(PartialIDctTest, SingleExtremeCoeff) { input_block_[vp9_default_scan_orders[tx_size_].scan[i]] = coeff; ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); ASM_REGISTER_STATE_CHECK( - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_)); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, pixel_size_ * output_block_size_)) << "Error: Fails with single coeff of " << coeff << " at " << i @@ -297,12 +305,12 @@ TEST_P(PartialIDctTest, DISABLED_Speed) { for (int i = 0; i < kCountSpeedTestBlock; ++i) { ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); } vpx_usec_timer timer; vpx_usec_timer_start(&timer); for (int i = 0; i < kCountSpeedTestBlock; ++i) { - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_); } libvpx_test::ClearSystemState(); vpx_usec_timer_mark(&timer); @@ -469,7 +477,9 @@ const PartialInvTxfmParam c_partial_idct_tests[] = { INSTANTIATE_TEST_CASE_P(C, PartialIDctTest, ::testing::ValuesIn(c_partial_idct_tests)); -#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE +#if !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON const PartialInvTxfmParam neon_partial_idct_tests[] = { #if CONFIG_VP9_HIGHBITDEPTH make_tuple(&vpx_highbd_fdct32x32_c, @@ -617,12 +627,42 @@ const PartialInvTxfmParam neon_partial_idct_tests[] = { INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest, ::testing::ValuesIn(neon_partial_idct_tests)); -#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE +#endif // HAVE_NEON -#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE +#if 
HAVE_SSE2 // 32x32_135_ is implemented using the 1024 version. const PartialInvTxfmParam sse2_partial_idct_tests[] = { #if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse2>, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse2>, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse2>, TX_32X32, + 1024, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse2>, TX_32X32, 135, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse2>, TX_32X32, 135, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse2>, TX_32X32, 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse2>, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse2>, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse2>, TX_32X32, 34, 12, 2), make_tuple( &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>, &highbd_wrapper<vpx_highbd_idct32x32_1_add_sse2>, TX_32X32, 1, 8, 2), @@ -642,6 +682,15 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>, &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse2>, TX_16X16, 256, 12, 2), make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 12, 2), + make_tuple( &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>, &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 2), make_tuple( @@ -701,12 +750,16 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>, &wrapper<vpx_idct32x32_1024_add_sse2>, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_135_add_c>, + &wrapper<vpx_idct32x32_135_add_sse2>, TX_32X32, 135, 8, 1), make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>, &wrapper<vpx_idct32x32_34_add_sse2>, TX_32X32, 34, 8, 1), make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1_add_c>, &wrapper<vpx_idct32x32_1_add_sse2>, TX_32X32, 1, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>, &wrapper<vpx_idct16x16_256_add_sse2>, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_38_add_c>, + &wrapper<vpx_idct16x16_38_add_sse2>, TX_16X16, 38, 8, 1), 
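Each entry in these tables is one PartialInvTxfmParam tuple. Read against the renamed members in SetUp(), a row decodes as in the annotated sketch below; the label on the last field is inferred from the memcmp size computation, so treat the comments as a reading aid rather than the harness's own documentation.

    make_tuple(&vpx_fdct16x16_c,                    // forward transform used to generate coefficients
               &wrapper<vpx_idct16x16_38_add_c>,    // full_inv_txfm_: C reference implementation
               &wrapper<vpx_idct16x16_38_add_sse2>, // partial_inv_txfm_: optimized function under test
               TX_16X16,                            // tx_size_
               38,                                  // last_nonzero_: how many coefficients are kept
               8,                                   // bit_depth_
               1),                                  // pixel size in bytes (2 for high bit depth)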
make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_10_add_c>, &wrapper<vpx_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_1_add_c>, @@ -726,27 +779,121 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest, ::testing::ValuesIn(sse2_partial_idct_tests)); -#endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE +#endif // HAVE_SSE2 -#if HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE +#if HAVE_SSSE3 const PartialInvTxfmParam ssse3_partial_idct_tests[] = { - make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>, - &wrapper<vpx_idct32x32_1024_add_ssse3>, TX_32X32, 1024, 8, 1), make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_135_add_c>, &wrapper<vpx_idct32x32_135_add_ssse3>, TX_32X32, 135, 8, 1), make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>, &wrapper<vpx_idct32x32_34_add_ssse3>, TX_32X32, 34, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>, - &wrapper<vpx_idct8x8_64_add_ssse3>, TX_8X8, 64, 8, 1), make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_12_add_c>, &wrapper<vpx_idct8x8_12_add_ssse3>, TX_8X8, 12, 8, 1) }; INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest, ::testing::ValuesIn(ssse3_partial_idct_tests)); -#endif // HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE +#endif // HAVE_SSSE3 -#if HAVE_DSPR2 && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH +const PartialInvTxfmParam sse4_1_partial_idct_tests[] = { + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse4_1>, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse4_1>, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse4_1>, TX_32X32, + 1024, 12, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse4_1>, TX_32X32, + 135, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse4_1>, TX_32X32, + 135, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse4_1>, TX_32X32, + 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse4_1>, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse4_1>, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse4_1>, TX_32X32, 34, 12, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16, + 256, 8, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16, + 256, 10, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16, + 256, 12, 2), + 
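Both the 8-bit and the high-bit-depth rows go through the wrapper<> and highbd_wrapper<> adapter templates so that every implementation can be called through the single InvTxfmWithBdFunc signature the harness stores. A plausible shape for those adapters is sketched below; the exact destination cast used on the high-bit-depth path is an assumption here, not a quote from the file.

    // Hypothetical adapter shapes: bind a concrete idct at compile time and
    // normalize its signature to (input, dest, stride, bit_depth).
    template <InvTxfmFunc fn>
    void wrapper(const tran_low_t *in, uint8_t *dest, int stride, int bd) {
      (void)bd;              // 8-bit functions take no bit-depth argument
      fn(in, dest, stride);
    }

    template <InvTxfmHighbdFunc fn>
    void highbd_wrapper(const tran_low_t *in, uint8_t *dest, int stride, int bd) {
      fn(in, CONVERT_TO_SHORTPTR(dest), stride, bd);  // high-bit-depth pixels are uint16_t
    }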
make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 12, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 12, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>, + &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 8, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>, + &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 10, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>, + &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 12, 2) +}; + +INSTANTIATE_TEST_CASE_P(SSE4_1, PartialIDctTest, + ::testing::ValuesIn(sse4_1_partial_idct_tests)); +#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH const PartialInvTxfmParam dspr2_partial_idct_tests[] = { make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>, &wrapper<vpx_idct32x32_1024_add_dspr2>, TX_32X32, 1024, 8, 1), @@ -774,9 +921,9 @@ const PartialInvTxfmParam dspr2_partial_idct_tests[] = { INSTANTIATE_TEST_CASE_P(DSPR2, PartialIDctTest, ::testing::ValuesIn(dspr2_partial_idct_tests)); -#endif // HAVE_DSPR2 && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH -#if HAVE_MSA && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH // 32x32_135_ is implemented using the 1024 version. 
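The guard cleanup running through this file trades the repeated "X && !CONFIG_EMULATE_HARDWARE" conjunctions for a single outer gate, so each per-ISA block now carries only its own condition. Schematically, the hunks above and below leave the file nested like this:

    #if !CONFIG_EMULATE_HARDWARE  // one outer gate for all SIMD instantiations
    #if HAVE_NEON
    /* NEON cases */
    #endif  // HAVE_NEON
    #if HAVE_SSE2
    /* SSE2 cases */
    #endif  // HAVE_SSE2
    #if HAVE_SSSE3
    /* SSSE3 cases */
    #endif  // HAVE_SSSE3
    #if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
    /* SSE4.1 cases */
    #endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
    #if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
    /* DSPR2 cases */
    #endif  // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
    #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
    /* MSA cases */
    #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
    #endif  // !CONFIG_EMULATE_HARDWARE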
const PartialInvTxfmParam msa_partial_idct_tests[] = { make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>, @@ -805,6 +952,8 @@ const PartialInvTxfmParam msa_partial_idct_tests[] = { INSTANTIATE_TEST_CASE_P(MSA, PartialIDctTest, ::testing::ValuesIn(msa_partial_idct_tests)); -#endif // HAVE_MSA && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH + +#endif // !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libvpx/test/pp_filter_test.cc b/libvpx/test/pp_filter_test.cc index 95da09c31..5a2ade1ef 100644 --- a/libvpx/test/pp_filter_test.cc +++ b/libvpx/test/pp_filter_test.cc @@ -57,12 +57,14 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { // 5-tap filter needs 2 padding rows above and below the block in the input. Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2); + ASSERT_TRUE(src_image.Init()); // Filter extends output block by 8 samples at left and right edges. // Though the left padding is only 8 bytes, the assembly code tries to // read 16 bytes before the pointer. Buffer<uint8_t> dst_image = Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8); + ASSERT_TRUE(dst_image.Init()); uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width)); @@ -88,8 +90,8 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { uint8_t *pixel_ptr = dst_image.TopLeftPixel(); for (int i = 0; i < block_height; ++i) { for (int j = 0; j < block_width; ++j) { - ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]) << "at (" << i << ", " << j - << ")"; + ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]) + << "at (" << i << ", " << j << ")"; } pixel_ptr += dst_image.stride(); } @@ -108,6 +110,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2, 2, 10, 2); + ASSERT_TRUE(src_image.Init()); // Filter extends output block by 8 samples at left and right edges. // Though the left padding is only 8 bytes, there is 'above' padding as well @@ -116,7 +119,9 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. Buffer<uint8_t> dst_image = Buffer<uint8_t>(block_width, block_height, 8, 8, 16, 8); + ASSERT_TRUE(dst_image.Init()); Buffer<uint8_t> dst_image_ref = Buffer<uint8_t>(block_width, block_height, 8); + ASSERT_TRUE(dst_image_ref.Init()); // Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock // can have a different filter. 
SSE2 assembly reads flimits in blocks of 16 so @@ -177,8 +182,8 @@ class VpxMbPostProcAcrossIpTest int rows, int cols, int src_pitch) { for (int r = 0; r < rows; r++) { for (int c = 0; c < cols; c++) { - ASSERT_EQ(expected_output[c], src_c[c]) << "at (" << r << ", " << c - << ")"; + ASSERT_EQ(expected_output[c], src_c[c]) + << "at (" << r << ", " << c << ")"; } src_c += src_pitch; } @@ -197,10 +202,12 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) { const int cols = 16; Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(src.Init()); src.SetPadding(10); SetCols(src.TopLeftPixel(), rows, cols, src.stride()); Buffer<uint8_t> expected_output = Buffer<uint8_t>(cols, rows, 0); + ASSERT_TRUE(expected_output.Init()); SetCols(expected_output.TopLeftPixel(), rows, cols, expected_output.stride()); RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(0), @@ -212,6 +219,7 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) { const int cols = 16; Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(src.Init()); src.SetPadding(10); SetCols(src.TopLeftPixel(), rows, cols, src.stride()); @@ -228,6 +236,7 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) { const int cols = 16; Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(src.Init()); src.SetPadding(10); SetCols(src.TopLeftPixel(), rows, cols, src.stride()); @@ -249,7 +258,9 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) { const int cols = 16; Buffer<uint8_t> c_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(c_mem.Init()); Buffer<uint8_t> asm_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(asm_mem.Init()); // When level >= 100, the filter behaves the same as the level = INT_MAX // When level < 20, it behaves the same as the level = 0 @@ -285,8 +296,8 @@ class VpxMbPostProcDownTest int rows, int cols, int src_pitch) { for (int r = 0; r < rows; r++) { for (int c = 0; c < cols; c++) { - ASSERT_EQ(expected_output[r * rows + c], src_c[c]) << "at (" << r - << ", " << c << ")"; + ASSERT_EQ(expected_output[r * rows + c], src_c[c]) + << "at (" << r << ", " << c << ")"; } src_c += src_pitch; } @@ -305,6 +316,7 @@ TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { const int cols = 16; Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); src_c.SetPadding(10); SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); @@ -340,6 +352,7 @@ TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { const int cols = 16; Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); src_c.SetPadding(10); SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); @@ -370,6 +383,7 @@ TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { const int cols = 16; Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); src_c.SetPadding(10); SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); @@ -392,7 +406,9 @@ TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { rnd.Reset(ACMRandom::DeterministicSeed()); Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); Buffer<uint8_t> src_asm = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_asm.Init()); for (int level = 0; level < 100; level++) { src_c.SetPadding(10); diff --git a/libvpx/test/predict_test.cc b/libvpx/test/predict_test.cc index a6e2b3cf3..9f366ae52 100644 --- a/libvpx/test/predict_test.cc +++ 
b/libvpx/test/predict_test.cc @@ -324,6 +324,15 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, &vp8_sixtap_predict4x4_msa))); #endif +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P( + MMI, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmi), + make_tuple(8, 8, &vp8_sixtap_predict8x8_mmi), + make_tuple(8, 4, &vp8_sixtap_predict8x4_mmi), + make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi))); +#endif + class BilinearPredictTest : public PredictTestBase {}; TEST_P(BilinearPredictTest, TestWithRandomData) { diff --git a/libvpx/test/quantize_test.cc b/libvpx/test/quantize_test.cc index 69da8994c..40bb2642e 100644 --- a/libvpx/test/quantize_test.cc +++ b/libvpx/test/quantize_test.cc @@ -200,4 +200,12 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c), make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c))); #endif // HAVE_MSA + +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P( + MMI, QuantizeTest, + ::testing::Values( + make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c), + make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c))); +#endif // HAVE_MMI } // namespace diff --git a/libvpx/test/register_state_check.h b/libvpx/test/register_state_check.h index 84641c8e9..a779e5c06 100644 --- a/libvpx/test/register_state_check.h +++ b/libvpx/test/register_state_check.h @@ -113,8 +113,8 @@ class RegisterStateCheck { int64_t post_store[8]; vpx_push_neon(post_store); for (int i = 0; i < 8; ++i) { - EXPECT_EQ(pre_store_[i], post_store[i]) << "d" << i + 8 - << " has been modified"; + EXPECT_EQ(pre_store_[i], post_store[i]) + << "d" << i + 8 << " has been modified"; } } diff --git a/libvpx/test/resize_test.cc b/libvpx/test/resize_test.cc index c9950dd43..e95dc6651 100644 --- a/libvpx/test/resize_test.cc +++ b/libvpx/test/resize_test.cc @@ -298,10 +298,10 @@ TEST_P(ResizeTest, TestExternalResizeWorks) { unsigned int expected_h; ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, &expected_h, 0); - EXPECT_EQ(expected_w, info->w) << "Frame " << frame - << " had unexpected width"; - EXPECT_EQ(expected_h, info->h) << "Frame " << frame - << " had unexpected height"; + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; } } @@ -513,10 +513,10 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { unsigned int expected_h; ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, &expected_h, 1); - EXPECT_EQ(expected_w, info->w) << "Frame " << frame - << " had unexpected width"; - EXPECT_EQ(expected_h, info->h) << "Frame " << frame - << " had unexpected height"; + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); } } diff --git a/libvpx/test/sad_test.cc b/libvpx/test/sad_test.cc index fe3983eb7..67c3c5315 100644 --- a/libvpx/test/sad_test.cc +++ b/libvpx/test/sad_test.cc @@ -644,19 +644,50 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); #if HAVE_NEON const SadMxNParam neon_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_neon), + SadMxNParam(64, 32, &vpx_sad64x32_neon), SadMxNParam(32, 32, &vpx_sad32x32_neon), + SadMxNParam(16, 32, &vpx_sad16x32_neon), SadMxNParam(16, 16, &vpx_sad16x16_neon), SadMxNParam(16, 8, &vpx_sad16x8_neon), SadMxNParam(8, 16, &vpx_sad8x16_neon), SadMxNParam(8, 8, 
&vpx_sad8x8_neon), + SadMxNParam(8, 4, &vpx_sad8x4_neon), + SadMxNParam(4, 8, &vpx_sad4x8_neon), SadMxNParam(4, 4, &vpx_sad4x4_neon), }; INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); +const SadMxNAvgParam avg_neon_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_neon), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_neon), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon), +}; +INSTANTIATE_TEST_CASE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); + const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon), SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_neon), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_neon), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon), }; INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); #endif // HAVE_NEON @@ -865,6 +896,14 @@ const SadMxNx4Param x4d_avx2_tests[] = { INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); #endif // HAVE_AVX2 +#if HAVE_AVX512 +const SadMxNx4Param x4d_avx512_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx512), +}; +INSTANTIATE_TEST_CASE_P(AVX512, SADx4Test, + ::testing::ValuesIn(x4d_avx512_tests)); +#endif // HAVE_AVX512 + //------------------------------------------------------------------------------ // MIPS functions #if HAVE_MSA @@ -934,5 +973,84 @@ const SadMxNParam vsx_tests[] = { SadMxNParam(16, 8, &vpx_sad16x8_vsx), }; INSTANTIATE_TEST_CASE_P(VSX, SADTest, ::testing::ValuesIn(vsx_tests)); + +const SadMxNAvgParam avg_vsx_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_vsx), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_vsx), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_vsx), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_vsx), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_vsx), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_vsx), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_vsx), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_vsx), +}; +INSTANTIATE_TEST_CASE_P(VSX, SADavgTest, ::testing::ValuesIn(avg_vsx_tests)); + +const SadMxNx4Param x4d_vsx_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_vsx), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_vsx), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_vsx), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_vsx), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_vsx), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_vsx), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_vsx), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_vsx), +}; +INSTANTIATE_TEST_CASE_P(VSX, SADx4Test, ::testing::ValuesIn(x4d_vsx_tests)); #endif // HAVE_VSX + +//------------------------------------------------------------------------------ +// Loongson functions +#if HAVE_MMI +const SadMxNParam 
mmi_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_mmi), + SadMxNParam(64, 32, &vpx_sad64x32_mmi), + SadMxNParam(32, 64, &vpx_sad32x64_mmi), + SadMxNParam(32, 32, &vpx_sad32x32_mmi), + SadMxNParam(32, 16, &vpx_sad32x16_mmi), + SadMxNParam(16, 32, &vpx_sad16x32_mmi), + SadMxNParam(16, 16, &vpx_sad16x16_mmi), + SadMxNParam(16, 8, &vpx_sad16x8_mmi), + SadMxNParam(8, 16, &vpx_sad8x16_mmi), + SadMxNParam(8, 8, &vpx_sad8x8_mmi), + SadMxNParam(8, 4, &vpx_sad8x4_mmi), + SadMxNParam(4, 8, &vpx_sad4x8_mmi), + SadMxNParam(4, 4, &vpx_sad4x4_mmi), +}; +INSTANTIATE_TEST_CASE_P(MMI, SADTest, ::testing::ValuesIn(mmi_tests)); + +const SadMxNAvgParam avg_mmi_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_mmi), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_mmi), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_mmi), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_mmi), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_mmi), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_mmi), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_mmi), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_mmi), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_mmi), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_mmi), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_mmi), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_mmi), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_mmi), +}; +INSTANTIATE_TEST_CASE_P(MMI, SADavgTest, ::testing::ValuesIn(avg_mmi_tests)); + +const SadMxNx4Param x4d_mmi_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_mmi), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_mmi), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_mmi), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_mmi), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_mmi), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_mmi), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_mmi), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_mmi), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_mmi), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_mmi), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_mmi), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_mmi), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_mmi), +}; +INSTANTIATE_TEST_CASE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests)); +#endif // HAVE_MMI } // namespace diff --git a/libvpx/test/set_roi.cc b/libvpx/test/set_roi.cc index 38711a806..f63954752 100644 --- a/libvpx/test/set_roi.cc +++ b/libvpx/test/set_roi.cc @@ -146,14 +146,6 @@ TEST(VP8RoiMapTest, ParameterCheck) { if (deltas_valid != roi_retval) break; } - // Test that we report and error if cyclic refresh is enabled. - cpi.cyclic_refresh_mode_enabled = 1; - roi_retval = - vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows, cpi.common.mb_cols, - delta_q, delta_lf, threshold); - EXPECT_EQ(-1, roi_retval) << "cyclic refresh check error"; - cpi.cyclic_refresh_mode_enabled = 0; - // Test invalid number of rows or colums. roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1, diff --git a/libvpx/test/temporal_filter_test.cc b/libvpx/test/temporal_filter_test.cc index 8615ba45a..655a36be9 100644 --- a/libvpx/test/temporal_filter_test.cc +++ b/libvpx/test/temporal_filter_test.cc @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <limits> + #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vp9_rtcd.h" @@ -35,6 +37,7 @@ void reference_filter(const Buffer<uint8_t> &a, const Buffer<uint8_t> &b, int w, Buffer<unsigned int> *accumulator, Buffer<uint16_t> *count) { Buffer<int> diff_sq = Buffer<int>(w, h, 0); + ASSERT_TRUE(diff_sq.Init()); diff_sq.Set(0); int rounding = 0; @@ -119,6 +122,7 @@ TEST_P(TemporalFilterTest, SizeCombinations) { // Depending on subsampling this function may be called with values of 8 or 16 // for width and height, in any combination. Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8); + ASSERT_TRUE(a.Init()); const int filter_weight = 2; const int filter_strength = 6; @@ -127,13 +131,20 @@ TEST_P(TemporalFilterTest, SizeCombinations) { for (int height = 8; height <= 16; height += 8) { // The second buffer must not have any border. Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0); + ASSERT_TRUE(b.Init()); Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0); + ASSERT_TRUE(accum_ref.Init()); Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0); + ASSERT_TRUE(accum_chk.Init()); Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0); + ASSERT_TRUE(count_ref.Init()); Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0); + ASSERT_TRUE(count_chk.Init()); - a.Set(&rnd_, &ACMRandom::Rand8); - b.Set(&rnd_, &ACMRandom::Rand8); + // The difference between the buffers must be small to pass the threshold + // to apply the filter. + a.Set(&rnd_, 0, 7); + b.Set(&rnd_, 0, 7); accum_ref.Set(rnd_.Rand8()); accum_chk.CopyFrom(accum_ref); @@ -161,18 +172,32 @@ TEST_P(TemporalFilterTest, CompareReferenceRandom) { for (int width = 8; width <= 16; width += 8) { for (int height = 8; height <= 16; height += 8) { Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8); + ASSERT_TRUE(a.Init()); // The second buffer must not have any border. Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0); + ASSERT_TRUE(b.Init()); Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0); + ASSERT_TRUE(accum_ref.Init()); Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0); + ASSERT_TRUE(accum_chk.Init()); Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0); + ASSERT_TRUE(count_ref.Init()); Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0); + ASSERT_TRUE(count_chk.Init()); for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) { for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) { - for (int repeat = 0; repeat < 10; ++repeat) { - a.Set(&rnd_, &ACMRandom::Rand8); - b.Set(&rnd_, &ACMRandom::Rand8); + for (int repeat = 0; repeat < 100; ++repeat) { + if (repeat < 50) { + a.Set(&rnd_, 0, 7); + b.Set(&rnd_, 0, 7); + } else { + // Check large (but close) values as well. + a.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7, + std::numeric_limits<uint8_t>::max()); + b.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7, + std::numeric_limits<uint8_t>::max()); + } accum_ref.Set(rnd_.Rand8()); accum_chk.CopyFrom(accum_ref); @@ -202,6 +227,7 @@ TEST_P(TemporalFilterTest, CompareReferenceRandom) { TEST_P(TemporalFilterTest, DISABLED_Speed) { Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8); + ASSERT_TRUE(a.Init()); const int filter_weight = 2; const int filter_strength = 6; @@ -210,13 +236,18 @@ TEST_P(TemporalFilterTest, DISABLED_Speed) { for (int height = 8; height <= 16; height += 8) { // The second buffer must not have any border. 
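The tightened value ranges in these temporal_filter_test.cc hunks follow from the filter's per-pixel weighting: the weight decays with the squared difference between the two buffers and reaches zero once the scaled square exceeds the cap, so full-range random inputs would leave most pixels unfiltered and the optimized path untested. A minimal sketch of that weighting, with the rounding term omitted (the scale of 3, the strength shift, and the cap of 16 mirror the reference_filter logic above):

    // Per-pixel filter weight: decays with (a - b)^2 and is zero once the
    // scaled squared difference exceeds 16.
    int TemporalFilterWeight(int a, int b, int strength, int filter_weight) {
      int modifier = (a - b) * (a - b) * 3;  // squared difference, scaled by 3
      modifier >>= strength;                 // strength is in [0, 6]
      if (modifier > 16) return 0;           // too different: pixel rejected
      return (16 - modifier) * filter_weight;
    }

With both buffers drawn from [0, 7], |a - b| <= 7, so even at the maximum strength of 6 the modifier is at most (7 * 7 * 3) >> 6 = 2 and every pixel keeps a nonzero weight; the second half of the repeats draws values within 7 of 255 to exercise the same path at the top of the 8-bit range.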
Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0); + ASSERT_TRUE(b.Init()); Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0); + ASSERT_TRUE(accum_ref.Init()); Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0); + ASSERT_TRUE(accum_chk.Init()); Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0); + ASSERT_TRUE(count_ref.Init()); Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0); + ASSERT_TRUE(count_chk.Init()); - a.Set(&rnd_, &ACMRandom::Rand8); - b.Set(&rnd_, &ACMRandom::Rand8); + a.Set(&rnd_, 0, 7); + b.Set(&rnd_, 0, 7); accum_chk.Set(0); count_chk.Set(0); diff --git a/libvpx/test/test-data.mk b/libvpx/test/test-data.mk index b39ab8763..f405e4ef1 100644 --- a/libvpx/test/test-data.mk +++ b/libvpx/test/test-data.mk @@ -732,6 +732,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm.md5 endif # CONFIG_VP9_HIGHBITDEPTH # Invalid files for testing libvpx error checking. +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm @@ -772,6 +774,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s367 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm @@ -874,3 +878,5 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm.md5 diff --git a/libvpx/test/test-data.sha1 b/libvpx/test/test-data.sha1 index 22ca6f564..99b4e1e46 100644 --- a/libvpx/test/test-data.sha1 +++ b/libvpx/test/test-data.sha1 @@ -6,6 +6,8 @@ b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res c123d1f9f02fb4143abb5e271916e3a3080de8f6 *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res +efafb92b7567bc04c3f1432ea6c268c1c31affd5 *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf +5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res 
fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v3.webm 5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-01-v3.webm.res d78e2fceba5ac942246503ec8366f879c4775ca5 *invalid-vp90-02-v2.webm @@ -848,3 +850,7 @@ a000d568431d07379dd5a8ec066061c07e560b47 *invalid-vp90-2-00-quantizer-63.ivf.kf_ 6fa3d3ac306a3d9ce1d610b78441dc00d2c2d4b9 *tos_vp8.webm e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res +fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf +fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res +17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm +e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5 diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk index c51e645c1..a3716be60 100644 --- a/libvpx/test/test.mk +++ b/libvpx/test/test.mk @@ -39,7 +39,6 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc @@ -124,6 +123,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc LIBVPX_TEST_SRCS-yes += idct_test.cc LIBVPX_TEST_SRCS-yes += predict_test.cc LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc +LIBVPX_TEST_SRCS-yes += vpx_scale_test.h ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes) LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp8_denoiser_sse2_test.cc @@ -154,11 +154,15 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += avg_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += comp_avg_pred_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_partial_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc +ifneq ($(CONFIG_REALTIME_ONLY),yes) LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += temporal_filter_test.cc +endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc diff --git a/libvpx/test/test_intra_pred_speed.cc b/libvpx/test/test_intra_pred_speed.cc index 23fce335a..1cdeda410 100644 --- a/libvpx/test/test_intra_pred_speed.cc +++ b/libvpx/test/test_intra_pred_speed.cc @@ -480,29 +480,70 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_d63_predictor_32x32_c, vpx_highbd_tm_predictor_32x32_c) #if HAVE_SSE2 -HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4, - vpx_highbd_dc_predictor_4x4_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_4x4_sse2, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, vpx_highbd_tm_predictor_4x4_c) +HIGHBD_INTRA_PRED_TEST( + SSE2, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_sse2, + vpx_highbd_dc_left_predictor_4x4_sse2, vpx_highbd_dc_top_predictor_4x4_sse2, + vpx_highbd_dc_128_predictor_4x4_sse2, 
vpx_highbd_v_predictor_4x4_sse2, + vpx_highbd_h_predictor_4x4_sse2, NULL, vpx_highbd_d135_predictor_4x4_sse2, + vpx_highbd_d117_predictor_4x4_sse2, vpx_highbd_d153_predictor_4x4_sse2, + vpx_highbd_d207_predictor_4x4_sse2, vpx_highbd_d63_predictor_4x4_sse2, + vpx_highbd_tm_predictor_4x4_c) HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8, - vpx_highbd_dc_predictor_8x8_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_8x8_sse2, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2) + vpx_highbd_dc_predictor_8x8_sse2, + vpx_highbd_dc_left_predictor_8x8_sse2, + vpx_highbd_dc_top_predictor_8x8_sse2, + vpx_highbd_dc_128_predictor_8x8_sse2, + vpx_highbd_v_predictor_8x8_sse2, + vpx_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2) HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16, - vpx_highbd_dc_predictor_16x16_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_16x16_sse2, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, - vpx_highbd_tm_predictor_16x16_sse2) + vpx_highbd_dc_predictor_16x16_sse2, + vpx_highbd_dc_left_predictor_16x16_sse2, + vpx_highbd_dc_top_predictor_16x16_sse2, + vpx_highbd_dc_128_predictor_16x16_sse2, + vpx_highbd_v_predictor_16x16_sse2, + vpx_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL, + NULL, NULL, NULL, vpx_highbd_tm_predictor_16x16_sse2) HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32, - vpx_highbd_dc_predictor_32x32_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_32x32_sse2, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, - vpx_highbd_tm_predictor_32x32_sse2) + vpx_highbd_dc_predictor_32x32_sse2, + vpx_highbd_dc_left_predictor_32x32_sse2, + vpx_highbd_dc_top_predictor_32x32_sse2, + vpx_highbd_dc_128_predictor_32x32_sse2, + vpx_highbd_v_predictor_32x32_sse2, + vpx_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL, + NULL, NULL, NULL, vpx_highbd_tm_predictor_32x32_sse2) #endif // HAVE_SSE2 +#if HAVE_SSSE3 +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_4x4_ssse3, NULL, + NULL, NULL, NULL, NULL, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_8x8_ssse3, + vpx_highbd_d135_predictor_8x8_ssse3, + vpx_highbd_d117_predictor_8x8_ssse3, + vpx_highbd_d153_predictor_8x8_ssse3, + vpx_highbd_d207_predictor_8x8_ssse3, + vpx_highbd_d63_predictor_8x8_ssse3, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_16x16_ssse3, + vpx_highbd_d135_predictor_16x16_ssse3, + vpx_highbd_d117_predictor_16x16_ssse3, + vpx_highbd_d153_predictor_16x16_ssse3, + vpx_highbd_d207_predictor_16x16_ssse3, + vpx_highbd_d63_predictor_16x16_ssse3, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_32x32_ssse3, + vpx_highbd_d135_predictor_32x32_ssse3, + vpx_highbd_d117_predictor_32x32_ssse3, + vpx_highbd_d153_predictor_32x32_ssse3, + vpx_highbd_d207_predictor_32x32_ssse3, + vpx_highbd_d63_predictor_32x32_ssse3, NULL) +#endif // HAVE_SSSE3 + #if HAVE_NEON HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon, diff --git a/libvpx/test/test_libvpx.cc b/libvpx/test/test_libvpx.cc index 8a70b4e28..30641ae8c 100644 --- a/libvpx/test/test_libvpx.cc +++ b/libvpx/test/test_libvpx.cc @@ -53,6 +53,9 @@ int main(int argc, char **argv) { } if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); if (!(simd_caps & HAS_AVX2)) 
append_negative_gtest_filter(":AVX2.*:AVX2/*"); + if (!(simd_caps & HAS_AVX512)) { + append_negative_gtest_filter(":AVX512.*:AVX512/*"); + } #endif // ARCH_X86 || ARCH_X86_64 #if !CONFIG_SHARED diff --git a/libvpx/test/test_vector_test.cc b/libvpx/test/test_vector_test.cc index 14c509d5c..1879b3d27 100644 --- a/libvpx/test/test_vector_test.cc +++ b/libvpx/test/test_vector_test.cc @@ -28,13 +28,10 @@ namespace { -enum DecodeMode { kSerialMode, kFrameParallelMode }; +const int kThreads = 0; +const int kFileName = 1; -const int kDecodeMode = 0; -const int kThreads = 1; -const int kFileName = 2; - -typedef std::tr1::tuple<int, int, const char *> DecodeParam; +typedef std::tr1::tuple<int, const char *> DecodeParam; class TestVectorTest : public ::libvpx_test::DecoderTest, public ::libvpx_test::CodecTestWithParam<DecodeParam> { @@ -53,8 +50,8 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, void OpenMD5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: " - << md5_file_name_; + ASSERT_TRUE(md5_file_ != NULL) + << "Md5 file open failed. Filename: " << md5_file_name_; } virtual void DecompressedFrameHook(const vpx_image_t &img, @@ -92,29 +89,14 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, TEST_P(TestVectorTest, MD5Match) { const DecodeParam input = GET_PARAM(1); const std::string filename = std::tr1::get<kFileName>(input); - const int threads = std::tr1::get<kThreads>(input); - const int mode = std::tr1::get<kDecodeMode>(input); vpx_codec_flags_t flags = 0; vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); char str[256]; - if (mode == kFrameParallelMode) { - flags |= VPX_CODEC_USE_FRAME_THREADING; -#if CONFIG_VP9_DECODER - // TODO(hkuang): Fix frame parallel decode bug. See issue 1086. - if (resize_clips_.find(filename) != resize_clips_.end()) { - printf("Skipping the test file: %s, due to frame parallel decode bug.\n", - filename.c_str()); - return; - } -#endif - } - - cfg.threads = threads; + cfg.threads = std::tr1::get<kThreads>(input); - snprintf(str, sizeof(str) / sizeof(str[0]) - 1, - "file: %s mode: %s threads: %d", filename.c_str(), - mode == 0 ? "Serial" : "Parallel", threads); + snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d", + filename.c_str(), cfg.threads); SCOPED_TRACE(str); // Open compressed video file. @@ -145,13 +127,10 @@ TEST_P(TestVectorTest, MD5Match) { ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg)); } -// Test VP8 decode in serial mode with single thread. -// NOTE: VP8 only support serial mode. #if CONFIG_VP8_DECODER VP8_INSTANTIATE_TEST_CASE( TestVectorTest, ::testing::Combine( - ::testing::Values(0), // Serial Mode. ::testing::Values(1), // Single thread. ::testing::ValuesIn(libvpx_test::kVP8TestVectors, libvpx_test::kVP8TestVectors + @@ -164,33 +143,28 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP8)), ::testing::Combine( - ::testing::Values(0), // Serial Mode. - ::testing::Range(1, 8), // With 1 ~ 8 threads. + ::testing::Range(2, 9), // With 2 ~ 8 threads. ::testing::ValuesIn(libvpx_test::kVP8TestVectors, libvpx_test::kVP8TestVectors + libvpx_test::kNumVP8TestVectors)))); #endif // CONFIG_VP8_DECODER -// Test VP9 decode in serial mode with single thread. #if CONFIG_VP9_DECODER VP9_INSTANTIATE_TEST_CASE( TestVectorTest, ::testing::Combine( - ::testing::Values(0), // Serial Mode. ::testing::Values(1), // Single thread. 
::testing::ValuesIn(libvpx_test::kVP9TestVectors, libvpx_test::kVP9TestVectors + libvpx_test::kNumVP9TestVectors))); -// Test VP9 decode in frame parallel mode with different number of threads. INSTANTIATE_TEST_CASE_P( - DISABLED_VP9MultiThreadedFrameParallel, TestVectorTest, + VP9MultiThreaded, TestVectorTest, ::testing::Combine( ::testing::Values( static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)), ::testing::Combine( - ::testing::Values(1), // Frame Parallel mode. ::testing::Range(2, 9), // With 2 ~ 8 threads. ::testing::ValuesIn(libvpx_test::kVP9TestVectors, libvpx_test::kVP9TestVectors + diff --git a/libvpx/test/test_vectors.cc b/libvpx/test/test_vectors.cc index def78da28..3ffc3efc4 100644 --- a/libvpx/test/test_vectors.cc +++ b/libvpx/test/test_vectors.cc @@ -371,6 +371,7 @@ const char *const kVP9TestVectors[] = { #endif // CONFIG_VP9_HIGHBITDEPTH "vp90-2-20-big_superframe-01.webm", "vp90-2-20-big_superframe-02.webm", + "vp90-2-22-svc_1280x720_1.webm", RESIZE_TEST_VECTORS }; const char *const kVP9TestVectorsSvc[] = { "vp90-2-22-svc_1280x720_3.ivf" }; diff --git a/libvpx/test/twopass_encoder.sh b/libvpx/test/twopass_encoder.sh index 7a223f2af..eaeaabdfd 100755 --- a/libvpx/test/twopass_encoder.sh +++ b/libvpx/test/twopass_encoder.sh @@ -54,7 +54,10 @@ twopass_encoder_vp9() { fi } -twopass_encoder_tests="twopass_encoder_vp8 - twopass_encoder_vp9" -run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}" +if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then + twopass_encoder_tests="twopass_encoder_vp8 + twopass_encoder_vp9" + + run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}" +fi diff --git a/libvpx/test/variance_test.cc b/libvpx/test/variance_test.cc index d607a097d..421024ad8 100644 --- a/libvpx/test/variance_test.cc +++ b/libvpx/test/variance_test.cc @@ -492,7 +492,7 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() { vpx_usec_timer timer; vpx_usec_timer_start(&timer); - for (int i = 0; i < 100000000 / block_size(); ++i) { + for (int i = 0; i < (1 << 30) / block_size(); ++i) { const uint32_t variance = params_.func(src_, width(), ref_, width(), &sse); // Ignore return value. 
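The new bound in the variance SpeedTest keeps the total number of pixels processed at about 2^30 whatever block is under test, roughly ten times the old 100000000 budget, and the power-of-two constant lets the division reduce to a shift. Worked out for a few sizes (own arithmetic, for orientation only):

    //   4x4:   block_size =   16 -> (1 << 30) / 16   = 67,108,864 iterations
    //   16x16: block_size =  256 -> (1 << 30) / 256  =  4,194,304 iterations
    //   64x64: block_size = 4096 -> (1 << 30) / 4096 =    262,144 iterations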
(void)variance; @@ -561,46 +561,26 @@ void MainTestClass<FunctionType>::MaxTestSse() { //////////////////////////////////////////////////////////////////////////////// -using ::std::tr1::get; -using ::std::tr1::make_tuple; -using ::std::tr1::tuple; - -template <typename SubpelVarianceFunctionType> +template <typename FunctionType> class SubpelVarianceTest - : public ::testing::TestWithParam< - tuple<int, int, SubpelVarianceFunctionType, int> > { + : public ::testing::TestWithParam<TestParams<FunctionType> > { public: virtual void SetUp() { - const tuple<int, int, SubpelVarianceFunctionType, int> ¶ms = - this->GetParam(); - log2width_ = get<0>(params); - width_ = 1 << log2width_; - log2height_ = get<1>(params); - height_ = 1 << log2height_; - subpel_variance_ = get<2>(params); - if (get<3>(params)) { - bit_depth_ = (vpx_bit_depth_t)get<3>(params); - use_high_bit_depth_ = true; - } else { - bit_depth_ = VPX_BITS_8; - use_high_bit_depth_ = false; - } - mask_ = (1 << bit_depth_) - 1; + params_ = this->GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); - block_size_ = width_ * height_; - if (!use_high_bit_depth_) { - src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); - sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); - ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; + if (!use_high_bit_depth()) { + src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size())); + sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size())); + ref_ = new uint8_t[block_size() + width() + height() + 1]; #if CONFIG_VP9_HIGHBITDEPTH } else { src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>( - vpx_memalign(16, block_size_ * sizeof(uint16_t)))); + vpx_memalign(16, block_size() * sizeof(uint16_t)))); sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>( - vpx_memalign(16, block_size_ * sizeof(uint16_t)))); - ref_ = - CONVERT_TO_BYTEPTR(new uint16_t[block_size_ + width_ + height_ + 1]); + vpx_memalign(16, block_size() * sizeof(uint16_t)))); + ref_ = CONVERT_TO_BYTEPTR( + new uint16_t[block_size() + width() + height() + 1]); #endif // CONFIG_VP9_HIGHBITDEPTH } ASSERT_TRUE(src_ != NULL); @@ -609,7 +589,7 @@ class SubpelVarianceTest } virtual void TearDown() { - if (!use_high_bit_depth_) { + if (!use_high_bit_depth()) { vpx_free(src_); delete[] ref_; vpx_free(sec_); @@ -631,42 +611,45 @@ class SubpelVarianceTest uint8_t *src_; uint8_t *ref_; uint8_t *sec_; - bool use_high_bit_depth_; - vpx_bit_depth_t bit_depth_; - int width_, log2width_; - int height_, log2height_; - int block_size_, mask_; - SubpelVarianceFunctionType subpel_variance_; + TestParams<FunctionType> params_; + + // some relay helpers + bool use_high_bit_depth() const { return params_.use_high_bit_depth; } + int byte_shift() const { return params_.bit_depth - 8; } + int block_size() const { return params_.block_size; } + int width() const { return params_.width; } + int height() const { return params_.height; } + uint32_t mask() const { return params_.mask; } }; template <typename SubpelVarianceFunctionType> void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - if (!use_high_bit_depth_) { - for (int j = 0; j < block_size_; j++) { + if (!use_high_bit_depth()) { + for (int j = 0; j < block_size(); j++) { src_[j] = rnd_.Rand8(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + for (int j = 0; j < block_size() + width() + height() + 1; j++) { ref_[j] = rnd_.Rand8(); } #if CONFIG_VP9_HIGHBITDEPTH } else { - 
for (int j = 0; j < block_size_; j++) { - CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size(); j++) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); } #endif // CONFIG_VP9_HIGHBITDEPTH } unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( - var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1)); - const unsigned int var2 = - subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2, - use_high_bit_depth_, bit_depth_); + var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1)); + const unsigned int var2 = subpel_variance_ref( + ref_, src_, params_.log2width, params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; } @@ -680,28 +663,28 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() { // Ref: Set the first half of values to the maximum, the second half to 0. for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - const int half = block_size_ / 2; - if (!use_high_bit_depth_) { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { memset(src_, 0, half); memset(src_ + half, 255, half); memset(ref_, 255, half); - memset(ref_ + half, 0, half + width_ + height_ + 1); + memset(ref_ + half, 0, half + width() + height() + 1); #if CONFIG_VP9_HIGHBITDEPTH } else { - vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half); + vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask(), half); vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half); vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half); - vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_, - half + width_ + height_ + 1); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask(), + half + width() + height() + 1); #endif // CONFIG_VP9_HIGHBITDEPTH } unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( - var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1)); - const unsigned int var2 = - subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2, - use_high_bit_depth_, bit_depth_); + var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1)); + const unsigned int var2 = subpel_variance_ref( + ref_, src_, params_.log2width, params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y; EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y; } @@ -712,33 +695,32 @@ template <> void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - if (!use_high_bit_depth_) { - for (int j = 0; j < block_size_; j++) { + if (!use_high_bit_depth()) { + for (int j = 0; j < block_size(); j++) { src_[j] = rnd_.Rand8(); sec_[j] = rnd_.Rand8(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + for (int j = 0; j < block_size() + width() + height() + 1; j++) { ref_[j] = rnd_.Rand8(); } #if CONFIG_VP9_HIGHBITDEPTH } else { - for (int j = 0; j < block_size_; j++) { - CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; - CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size(); j++) { + 
CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); } #endif // CONFIG_VP9_HIGHBITDEPTH } uint32_t sse1, sse2; uint32_t var1, var2; - ASM_REGISTER_STATE_CHECK(var1 = - subpel_variance_(ref_, width_ + 1, x, y, - src_, width_, &sse1, sec_)); - var2 = subpel_avg_variance_ref(ref_, src_, sec_, log2width_, log2height_, - x, y, &sse2, use_high_bit_depth_, - static_cast<vpx_bit_depth_t>(bit_depth_)); + ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y, + src_, width(), &sse1, sec_)); + var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width, + params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; } @@ -798,37 +780,41 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_variance4x8_c), VarianceParams(2, 2, &vpx_variance4x4_c))); +typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); - + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_c, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_c, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_c, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_c, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_c, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_c, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_c, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_c, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_c, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_c, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_c, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); + +typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxSubpelAvgVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0), - make_tuple(3, 4, 
&vpx_sub_pixel_avg_variance8x16_c, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); #if CONFIG_VP9_HIGHBITDEPTH typedef MainTestClass<VarianceMxNFunc> VpxHBDMseTest; @@ -850,18 +836,18 @@ TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } /* TODO(debargha): This test does not support the highbd version INSTANTIATE_TEST_CASE_P( C, VpxHBDMseTest, - ::testing::Values(make_tuple(4, 4, &vpx_highbd_12_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_12_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_12_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_12_mse8x8_c), - make_tuple(4, 4, &vpx_highbd_10_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_10_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_10_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_10_mse8x8_c), - make_tuple(4, 4, &vpx_highbd_8_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_8_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_8_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_8_mse8x8_c))); + ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c), + MseParams(4, 4, &vpx_highbd_12_mse16x8_c), + MseParams(4, 4, &vpx_highbd_12_mse8x16_c), + MseParams(4, 4, &vpx_highbd_12_mse8x8_c), + MseParams(4, 4, &vpx_highbd_10_mse16x16_c), + MseParams(4, 4, &vpx_highbd_10_mse16x8_c), + MseParams(4, 4, &vpx_highbd_10_mse8x16_c), + MseParams(4, 4, &vpx_highbd_10_mse8x8_c), + MseParams(4, 4, &vpx_highbd_8_mse16x16_c), + MseParams(4, 4, &vpx_highbd_8_mse16x8_c), + MseParams(4, 4, &vpx_highbd_8_mse8x16_c), + MseParams(4, 4, &vpx_highbd_8_mse8x8_c))); */ INSTANTIATE_TEST_CASE_P( @@ -909,88 +895,161 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( C, VpxHBDSubpelVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8), - make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8), - make_tuple(2, 2, 
&vpx_highbd_8_sub_pixel_variance4x4_c, 8), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10), - make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10), - make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10), - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12), - make_tuple(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12), - make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12))); + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8), + SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8), + SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10), + 
SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10), + SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10), + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, + 12), + SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12), + SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12), + SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, + 12))); INSTANTIATE_TEST_CASE_P( C, VpxHBDSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8), - make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8), - make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_c, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_c, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_c, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_c, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_c, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_c, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10), - make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10), - make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10), - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_c, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_c, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_c, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_c, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_c, 12), - 
make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_c, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12), - make_tuple(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12), - make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12))); + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8), + SubpelAvgVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, + 8), + SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, + 8), + SubpelAvgVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, + 8), + SubpelAvgVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, + 8), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_c, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_c, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_c, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_c, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_c, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_c, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_c, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_c, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_c, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_c, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_c, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_c, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_c, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_c, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_c, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_c, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_c, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_c, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12), + 
SubpelAvgVarianceParams(2, 2, + &vpx_highbd_12_sub_pixel_avg_variance4x4_c, + 12))); #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_SSE2 @@ -1021,36 +1080,37 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0))); INSTANTIATE_TEST_CASE_P( SSE2, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0), + SubpelAvgVarianceParams(3, 2, 
&vpx_sub_pixel_avg_variance8x4_sse2, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0))); #if CONFIG_VP9_HIGHBITDEPTH /* TODO(debargha): This test does not support the highbd version @@ -1107,112 +1167,219 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2, VpxHBDSubpelVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, 12), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, 10), - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, 8))); + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, + 12), + SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, + 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, + 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, + 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, + 12), + 
SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, + 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, + 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, + 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, + 10), + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, + 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, + 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, + 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, + 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, + 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, + 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, + 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, + 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, + 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, + 8))); INSTANTIATE_TEST_CASE_P( SSE2, VpxHBDSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, 12), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, 10), - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, 8), - make_tuple(6, 5, 
&vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, 8))); + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, + 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, + 12), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, + 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, + 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, + 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, + 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, + 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, + 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, + 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, + 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, + 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, + 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, + 8), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, + 8), + 
SubpelAvgVarianceParams(3, 2, + &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, + 8))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_SSE2 #if HAVE_SSSE3 INSTANTIATE_TEST_CASE_P( SSSE3, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0))); INSTANTIATE_TEST_CASE_P( SSSE3, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, + 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, + 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, + 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0), 
+ SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, + 0))); #endif // HAVE_SSSE3 #if HAVE_AVX2 @@ -1229,14 +1396,16 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( AVX2, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0))); INSTANTIATE_TEST_CASE_P( AVX2, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, + 0))); #endif // HAVE_AVX2 #if HAVE_NEON @@ -1265,17 +1434,37 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( NEON, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_neon, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_neon, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_neon, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_neon, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_neon, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_neon, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_neon, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_neon, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_neon, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_neon, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_neon, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_neon, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_neon, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_neon, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_neon, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_neon, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_neon, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_neon, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_neon, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_neon, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_neon, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_neon, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_neon, 0))); + +INSTANTIATE_TEST_CASE_P( + NEON, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_neon, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_neon, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_neon, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_neon, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_neon, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_neon, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_neon, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_neon, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_neon, 0), + 
SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0))); #endif // HAVE_NEON #if HAVE_MSA @@ -1310,35 +1499,37 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( MSA, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(2, 2, &vpx_sub_pixel_variance4x4_msa, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_msa, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_msa, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_msa, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_msa, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_msa, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_msa, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_msa, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_msa, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_msa, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_msa, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_msa, 0), - make_tuple(6, 6, &vpx_sub_pixel_variance64x64_msa, 0))); + ::testing::Values( + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_msa, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_msa, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_msa, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_msa, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_msa, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_msa, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_msa, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_msa, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_msa, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_msa, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_msa, 0), + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_msa, 0))); INSTANTIATE_TEST_CASE_P( MSA, VpxSubpelAvgVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0))); + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0), + 
SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0))); #endif // HAVE_MSA #if HAVE_VSX @@ -1349,4 +1540,62 @@ INSTANTIATE_TEST_CASE_P(VSX, VpxSseTest, ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_vsx))); #endif // HAVE_VSX + +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P(MMI, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_mmi), + MseParams(4, 3, &vpx_mse16x8_mmi), + MseParams(3, 4, &vpx_mse8x16_mmi), + MseParams(3, 3, &vpx_mse8x8_mmi))); + +INSTANTIATE_TEST_CASE_P( + MMI, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_mmi), + VarianceParams(6, 5, &vpx_variance64x32_mmi), + VarianceParams(5, 6, &vpx_variance32x64_mmi), + VarianceParams(5, 5, &vpx_variance32x32_mmi), + VarianceParams(5, 4, &vpx_variance32x16_mmi), + VarianceParams(4, 5, &vpx_variance16x32_mmi), + VarianceParams(4, 4, &vpx_variance16x16_mmi), + VarianceParams(4, 3, &vpx_variance16x8_mmi), + VarianceParams(3, 4, &vpx_variance8x16_mmi), + VarianceParams(3, 3, &vpx_variance8x8_mmi), + VarianceParams(3, 2, &vpx_variance8x4_mmi), + VarianceParams(2, 3, &vpx_variance4x8_mmi), + VarianceParams(2, 2, &vpx_variance4x4_mmi))); + +INSTANTIATE_TEST_CASE_P( + MMI, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_mmi, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_mmi, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_mmi, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_mmi, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_mmi, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_mmi, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_mmi, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_mmi, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_mmi, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_mmi, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_mmi, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_mmi, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_mmi, 0))); + +INSTANTIATE_TEST_CASE_P( + MMI, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_mmi, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_mmi, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_mmi, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_mmi, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_mmi, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_mmi, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_mmi, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_mmi, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_mmi, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_mmi, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_mmi, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_mmi, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_mmi, 0))); +#endif // HAVE_MMI } // namespace diff --git a/libvpx/test/vp8_fdct4x4_test.cc b/libvpx/test/vp8_fdct4x4_test.cc index 9f69ae164..b7697d859 100644 --- a/libvpx/test/vp8_fdct4x4_test.cc +++ 
b/libvpx/test/vp8_fdct4x4_test.cc
@@ -199,4 +199,8 @@ INSTANTIATE_TEST_CASE_P(SSE2, FdctTest,
 INSTANTIATE_TEST_CASE_P(MSA, FdctTest,
                         ::testing::Values(vp8_short_fdct4x4_msa));
 #endif  // HAVE_MSA
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(MMI, FdctTest,
+                        ::testing::Values(vp8_short_fdct4x4_mmi));
+#endif  // HAVE_MMI
 }  // namespace
diff --git a/libvpx/test/vp9_encoder_parms_get_to_decoder.cc b/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
index 53dc8c9fe..62e8dcb9b 100644
--- a/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
+++ b/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
@@ -99,9 +99,7 @@ class VpxEncoderParmsGetToDecoder
     vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder();
     vpx_codec_alg_priv_t *const priv =
         reinterpret_cast<vpx_codec_alg_priv_t *>(vp9_decoder->priv);
-    FrameWorkerData *const worker_data =
-        reinterpret_cast<FrameWorkerData *>(priv->frame_workers[0].data1);
-    VP9_COMMON *const common = &worker_data->pbi->common;
+    VP9_COMMON *const common = &priv->pbi->common;

     if (encode_parms.lossless) {
       EXPECT_EQ(0, common->base_qindex);
diff --git a/libvpx/test/vp9_ethread_test.cc b/libvpx/test/vp9_ethread_test.cc
index 4e8d814c1..6b7e51211 100644
--- a/libvpx/test/vp9_ethread_test.cc
+++ b/libvpx/test/vp9_ethread_test.cc
@@ -50,7 +50,6 @@ class VPxFirstPassEncoderThreadTest
     InitializeConfig();
     SetMode(encoding_mode_);

-    cfg_.g_lag_in_frames = 3;
     cfg_.rc_end_usage = VPX_VBR;
     cfg_.rc_2pass_vbr_minsection_pct = 5;
     cfg_.rc_2pass_vbr_maxsection_pct = 2000;
@@ -128,8 +127,10 @@ static void compare_fp_stats(vpx_fixed_buf_t *fp_stats, double factor) {
     const double *frame_stats2 = reinterpret_cast<double *>(stats2);

     for (j = 0; j < kDbl; ++j) {
-      EXPECT_LE(fabs(*frame_stats1 - *frame_stats2),
-                fabs(*frame_stats1) / factor);
+      ASSERT_LE(fabs(*frame_stats1 - *frame_stats2),
+                fabs(*frame_stats1) / factor)
+          << "First failure @ frame #" << i << " stat #" << j << " ("
+          << *frame_stats1 << " vs. " << *frame_stats2 << ")";
       frame_stats1++;
       frame_stats2++;
     }
@@ -183,7 +184,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

   // Compare to check if using or not using row-mt generates close stats.
-  compare_fp_stats(&firstpass_stats_, 1000.0);
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0));

   // Test single thread vs multiple threads
   row_mt_mode_ = 1;
@@ -197,7 +198,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

   // Compare to check if single-thread and multi-thread stats are close enough.
-  compare_fp_stats(&firstpass_stats_, 1000.0);
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0));

   // Bit exact test in row_mt mode.
   // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact
@@ -238,7 +239,6 @@ class VPxEncoderThreadTest
     SetMode(encoding_mode_);

     if (encoding_mode_ != ::libvpx_test::kRealTime) {
-      cfg_.g_lag_in_frames = 3;
       cfg_.rc_end_usage = VPX_VBR;
       cfg_.rc_2pass_vbr_minsection_pct = 5;
       cfg_.rc_2pass_vbr_maxsection_pct = 2000;
@@ -340,8 +340,6 @@ TEST_P(VPxEncoderThreadTest, EncoderResultTest) {
   ASSERT_EQ(single_thr_md5, multi_thr_md5);

   // Part 2: row_mt_mode_ = 0 vs row_mt_mode_ = 1 single thread bit exact test.
-  // The first-pass stats are not bit exact here, but that difference doesn't
-  // cause a mismatch between the final bitstreams.
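// [Editor's sketch, not part of the patch: why the compare_fp_stats() calls
// above are wrapped in ASSERT_NO_FATAL_FAILURE.] In GoogleTest a fatal
// failure (ASSERT_*) inside a helper function only aborts that helper; the
// calling test keeps running unless it checks for the failure itself.
// The names below are hypothetical.
static void HelperThatMayFail(int value) {
  ASSERT_GT(value, 0) << "value must be positive";  // aborts the helper only
}

TEST(ExampleTest, StopsWhenHelperFails) {
  // Without the wrapper, execution would continue past the failing helper.
  ASSERT_NO_FATAL_FAILURE(HelperThatMayFail(-1));
  // Not reached when the helper raises a fatal failure.
}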
   row_mt_mode_ = 1;

   // Encode using single thread
diff --git a/libvpx/test/vp9_frame_parallel_test.cc b/libvpx/test/vp9_frame_parallel_test.cc
deleted file mode 100644
index 136557720..000000000
--- a/libvpx/test/vp9_frame_parallel_test.cc
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
-#include "test/codec_factory.h"
-#include "test/decode_test_driver.h"
-#include "test/ivf_video_source.h"
-#include "test/md5_helper.h"
-#include "test/util.h"
-#if CONFIG_WEBM_IO
-#include "test/webm_video_source.h"
-#endif
-#include "vpx_mem/vpx_mem.h"
-
-namespace {
-
-using std::string;
-
-#if CONFIG_WEBM_IO
-
-struct PauseFileList {
-  const char *name;
-  // md5 sum for decoded frames which does not include skipped frames.
-  const char *expected_md5;
-  const int pause_frame_num;
-};
-
-// Decodes |filename| with |num_threads|. Pause at the specified frame_num,
-// seek to next key frame and then continue decoding until the end. Return
-// the md5 of the decoded frames which does not include skipped frames.
-string DecodeFileWithPause(const string &filename, int num_threads,
-                           int pause_num) {
-  libvpx_test::WebMVideoSource video(filename);
-  video.Init();
-  int in_frames = 0;
-  int out_frames = 0;
-
-  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
-  cfg.threads = num_threads;
-  vpx_codec_flags_t flags = 0;
-  flags |= VPX_CODEC_USE_FRAME_THREADING;
-  libvpx_test::VP9Decoder decoder(cfg, flags);
-
-  libvpx_test::MD5 md5;
-  video.Begin();
-
-  do {
-    ++in_frames;
-    const vpx_codec_err_t res =
-        decoder.DecodeFrame(video.cxdata(), video.frame_size());
-    if (res != VPX_CODEC_OK) {
-      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
-      break;
-    }
-
-    // Pause at specified frame number.
-    if (in_frames == pause_num) {
-      // Flush the decoder and then seek to next key frame.
-      decoder.DecodeFrame(NULL, 0);
-      video.SeekToNextKeyFrame();
-    } else {
-      video.Next();
-    }
-
-    // Flush the decoder at the end of the video.
-    if (!video.cxdata()) decoder.DecodeFrame(NULL, 0);
-
-    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
-    const vpx_image_t *img;
-
-    // Get decompressed data
-    while ((img = dec_iter.Next())) {
-      ++out_frames;
-      md5.Add(img);
-    }
-  } while (video.cxdata() != NULL);
-
-  EXPECT_EQ(in_frames, out_frames)
-      << "Input frame count does not match output frame count";
-
-  return string(md5.Get());
-}
-
-void DecodeFilesWithPause(const PauseFileList files[]) {
-  for (const PauseFileList *iter = files; iter->name != NULL; ++iter) {
-    SCOPED_TRACE(iter->name);
-    for (int t = 2; t <= 8; ++t) {
-      EXPECT_EQ(iter->expected_md5,
-                DecodeFileWithPause(iter->name, t, iter->pause_frame_num))
-          << "threads = " << t;
-    }
-  }
-}
-
-TEST(DISABLED_VP9MultiThreadedFrameParallel, PauseSeekResume) {
-  // vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
-  // one key frame for every ten frames.
-  static const PauseFileList files[] = {
-    { "vp90-2-07-frame_parallel-1.webm", "6ea7c3875d67252e7caf2bc6e75b36b1",
-      6 },
-    { "vp90-2-07-frame_parallel-1.webm", "4bb634160c7356a8d7d4299b6dc83a45",
-      12 },
-    { "vp90-2-07-frame_parallel-1.webm", "89772591e6ef461f9fa754f916c78ed8",
-      26 },
-    { NULL, NULL, 0 },
-  };
-  DecodeFilesWithPause(files);
-}
-
-struct FileList {
-  const char *name;
-  // md5 sum for decoded frames which does not include corrupted frames.
-  const char *expected_md5;
-  // Expected number of decoded frames which does not include corrupted frames.
-  const int expected_frame_count;
-};
-
-// Decodes |filename| with |num_threads|. Return the md5 of the decoded
-// frames which does not include corrupted frames.
-string DecodeFile(const string &filename, int num_threads,
-                  int expected_frame_count) {
-  libvpx_test::WebMVideoSource video(filename);
-  video.Init();
-
-  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
-  cfg.threads = num_threads;
-  const vpx_codec_flags_t flags = VPX_CODEC_USE_FRAME_THREADING;
-  libvpx_test::VP9Decoder decoder(cfg, flags);
-
-  libvpx_test::MD5 md5;
-  video.Begin();
-
-  int out_frames = 0;
-  do {
-    const vpx_codec_err_t res =
-        decoder.DecodeFrame(video.cxdata(), video.frame_size());
-    // TODO(hkuang): frame parallel mode should return an error on corruption.
-    if (res != VPX_CODEC_OK) {
-      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
-      break;
-    }
-
-    video.Next();
-
-    // Flush the decoder at the end of the video.
-    if (!video.cxdata()) decoder.DecodeFrame(NULL, 0);
-
-    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
-    const vpx_image_t *img;
-
-    // Get decompressed data
-    while ((img = dec_iter.Next())) {
-      ++out_frames;
-      md5.Add(img);
-    }
-  } while (video.cxdata() != NULL);
-
-  EXPECT_EQ(expected_frame_count, out_frames)
-      << "Input frame count does not match expected output frame count";
-
-  return string(md5.Get());
-}
-
-void DecodeFiles(const FileList files[]) {
-  for (const FileList *iter = files; iter->name != NULL; ++iter) {
-    SCOPED_TRACE(iter->name);
-    for (int t = 2; t <= 8; ++t) {
-      EXPECT_EQ(iter->expected_md5,
-                DecodeFile(iter->name, t, iter->expected_frame_count))
-          << "threads = " << t;
-    }
-  }
-}
-
-TEST(DISABLED_VP9MultiThreadedFrameParallel, InvalidFileTest) {
-  static const FileList files[] = {
-    // invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
-    // one key frame for every ten frames. The 11th frame has corrupted data.
-    { "invalid-vp90-2-07-frame_parallel-1.webm",
-      "0549d0f45f60deaef8eb708e6c0eb6cb", 30 },
-    // invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with
-    // one key frame for every ten frames. The 1st and 31st frames have
-    // corrupted data.
-    { "invalid-vp90-2-07-frame_parallel-2.webm",
-      "6a1f3cf6f9e7a364212fadb9580d525e", 20 },
-    // invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with
-    // one key frame for every ten frames. The 5th and 13th frames have
-    // corrupted data.
-    { "invalid-vp90-2-07-frame_parallel-3.webm",
-      "8256544308de926b0681e04685b98677", 27 },
-    { NULL, NULL, 0 },
-  };
-  DecodeFiles(files);
-}
-
-TEST(DISABLED_VP9MultiThreadedFrameParallel, ValidFileTest) {
-  static const FileList files[] = {
-#if CONFIG_VP9_HIGHBITDEPTH
-    { "vp92-2-20-10bit-yuv420.webm", "a16b99df180c584e8db2ffeda987d293", 10 },
-#endif
-    { NULL, NULL, 0 },
-  };
-  DecodeFiles(files);
-}
-#endif  // CONFIG_WEBM_IO
-}  // namespace
diff --git a/libvpx/test/vp9_intrapred_test.cc b/libvpx/test/vp9_intrapred_test.cc
index bee0213ea..39c5e79eb 100644
--- a/libvpx/test/vp9_intrapred_test.cc
+++ b/libvpx/test/vp9_intrapred_test.cc
@@ -467,10 +467,164 @@ TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) {
   RunTest(left_col, above_data, dst, ref_dst);
 }

+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3_TO_C_8, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3,
+                             &vpx_highbd_d63_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c,
+                             &vpx_highbd_d63_predictor_32x32_ssse3, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3,
+                             &vpx_highbd_d117_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3,
+                             &vpx_highbd_d117_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c,
+                             &vpx_highbd_d117_predictor_32x32_ssse3, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3,
+                             &vpx_highbd_d153_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3,
+                             &vpx_highbd_d153_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3,
+                             &vpx_highbd_d153_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3,
+                             &vpx_highbd_d207_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3,
+                             &vpx_highbd_d207_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3,
+                             &vpx_highbd_d207_predictor_32x32_c, 32, 8)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSSE3_TO_C_10, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 10),
+
HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 10))); + +INSTANTIATE_TEST_CASE_P( + SSSE3_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, + &vpx_highbd_d45_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3, + &vpx_highbd_d45_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3, + &vpx_highbd_d45_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3, + &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3, + &vpx_highbd_d63_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 12), + 
HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 12))); +#endif // HAVE_SSSE3 + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2_TO_C_8, VP9HighbdIntraPredTest, ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, @@ -479,6 +633,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_dc_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, &vpx_highbd_dc_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, @@ -487,6 +649,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_tm_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, @@ -499,6 +669,32 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2_TO_C_10, VP9HighbdIntraPredTest, ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), + 
HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, @@ -507,6 +703,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_dc_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, &vpx_highbd_dc_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, @@ -515,6 +719,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_tm_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, @@ -527,6 +739,32 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2_TO_C_12, VP9HighbdIntraPredTest, ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 12), + 
HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, @@ -535,6 +773,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_dc_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, &vpx_highbd_dc_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, @@ -543,6 +789,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_tm_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, diff --git a/libvpx/test/vp9_quantize_test.cc b/libvpx/test/vp9_quantize_test.cc index 464389502..b18d4522c 100644 --- a/libvpx/test/vp9_quantize_test.cc +++ b/libvpx/test/vp9_quantize_test.cc @@ -14,9 +14,11 @@ #include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vp9_rtcd.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/buffer.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" @@ -24,11 +26,12 @@ #include "vp9/common/vp9_scan.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; +using libvpx_test::Buffer; namespace { -#if CONFIG_VP9_HIGHBITDEPTH const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, @@ -38,307 +41,494 @@ typedef void (*QuantizeFunc)(const 
tran_low_t *coeff, intptr_t count, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, const int16_t *scan, const int16_t *iscan); -typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t> +typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t, + int /*max_size*/, bool /*is_fp*/> QuantizeParam; -class VP9QuantizeTest : public ::testing::TestWithParam<QuantizeParam> { +// Wrapper for FP version which does not use zbin or quant_shift. +typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, + int skip_block, const int16_t *round, + const int16_t *quant, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, + uint16_t *eob, const int16_t *scan, + const int16_t *iscan); + +template <QuantizeFPFunc fn> +void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block, + const int16_t *zbin, const int16_t *round, + const int16_t *quant, const int16_t *quant_shift, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, const int16_t *scan, + const int16_t *iscan) { + (void)zbin; + (void)quant_shift; + + fn(coeff, count, skip_block, round, quant, qcoeff, dqcoeff, dequant, eob, + scan, iscan); +} + +class VP9QuantizeBase { public: - virtual ~VP9QuantizeTest() {} - virtual void SetUp() { - quantize_op_ = GET_PARAM(0); - ref_quantize_op_ = GET_PARAM(1); - bit_depth_ = GET_PARAM(2); - mask_ = (1 << bit_depth_) - 1; + VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp) + : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp) { + max_value_ = (1 << bit_depth_) - 1; + zbin_ptr_ = + reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); + round_fp_ptr_ = reinterpret_cast<int16_t *>( + vpx_memalign(16, 8 * sizeof(*round_fp_ptr_))); + quant_fp_ptr_ = reinterpret_cast<int16_t *>( + vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_))); + round_ptr_ = + reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_))); + quant_ptr_ = + reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_))); + quant_shift_ptr_ = reinterpret_cast<int16_t *>( + vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); + dequant_ptr_ = reinterpret_cast<int16_t *>( + vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + ~VP9QuantizeBase() { + vpx_free(zbin_ptr_); + vpx_free(round_fp_ptr_); + vpx_free(quant_fp_ptr_); + vpx_free(round_ptr_); + vpx_free(quant_ptr_); + vpx_free(quant_shift_ptr_); + vpx_free(dequant_ptr_); + zbin_ptr_ = NULL; + round_fp_ptr_ = NULL; + quant_fp_ptr_ = NULL; + round_ptr_ = NULL; + quant_ptr_ = NULL; + quant_shift_ptr_ = NULL; + dequant_ptr_ = NULL; + libvpx_test::ClearSystemState(); + } protected: - vpx_bit_depth_t bit_depth_; - int mask_; - QuantizeFunc quantize_op_; - QuantizeFunc ref_quantize_op_; + int16_t *zbin_ptr_; + int16_t *round_fp_ptr_; + int16_t *quant_fp_ptr_; + int16_t *round_ptr_; + int16_t *quant_ptr_; + int16_t *quant_shift_ptr_; + int16_t *dequant_ptr_; + const vpx_bit_depth_t bit_depth_; + int max_value_; + const int max_size_; + const bool is_fp_; }; -class VP9Quantize32Test : public ::testing::TestWithParam<QuantizeParam> { +class VP9QuantizeTest : public VP9QuantizeBase, + public ::testing::TestWithParam<QuantizeParam> { public: - virtual ~VP9Quantize32Test() {} - virtual void SetUp() { - quantize_op_ = GET_PARAM(0); - ref_quantize_op_ = GET_PARAM(1); - bit_depth_ = GET_PARAM(2); - mask_ = (1 << bit_depth_) - 1; - } - - virtual void TearDown() { libvpx_test::ClearSystemState(); } + 
VP9QuantizeTest() + : VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3), GET_PARAM(4)), + quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {} protected: - vpx_bit_depth_t bit_depth_; - int mask_; - QuantizeFunc quantize_op_; - QuantizeFunc ref_quantize_op_; + const QuantizeFunc quantize_op_; + const QuantizeFunc ref_quantize_op_; }; -TEST_P(VP9QuantizeTest, OperationCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; - for (int i = 0; i < number_of_iterations; ++i) { - const int skip_block = i == 0; - const TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16 - const TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); // 16, 64, 256 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = rnd.Rand16() & mask_; - } - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); +// This quantizer compares the AC coefficients to the quantization step size to +// determine if further multiplication operations are needed. +// Based on vp9_quantize_fp_sse2(). +void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + int i, eob = -1; + const int thr = dequant_ptr[1] >> 1; + (void)iscan; + (void)skip_block; + assert(!skip_block); + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i += 16) { + int y; + int nzflag_cnt = 0; + int abs_coeff[16]; + int coeff_sign[16]; + + // count nzflag for each row (16 tran_low_t) + for (y = 0; y < 16; ++y) { + const int rc = i + y; + const int coeff = coeff_ptr[rc]; + coeff_sign[y] = (coeff >> 31); + abs_coeff[y] = (coeff ^ coeff_sign[y]) - coeff_sign[y]; + // The first 16 are skipped in the sse2 code. Do the same here to match. 
+ if (i >= 16 && (abs_coeff[y] <= thr)) { + nzflag_cnt++; + } } - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, - scan_order->scan, scan_order->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); + + for (y = 0; y < 16; ++y) { + const int rc = i + y; + // If all of the AC coeffs in a row has magnitude less than the + // quantization step_size/2, quantize to zero. + if (nzflag_cnt < 16) { + int tmp = + clamp(abs_coeff[y] + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; + qcoeff_ptr[rc] = (tmp ^ coeff_sign[y]) - coeff_sign[y]; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; + } + + // Scan for eob. + for (i = 0; i < n_coeffs; i++) { + // Use the scan order to find the correct eob. + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; } - err_count_total += err_count; } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. " - << "First failed at test case " << first_failure; + *eob_ptr = eob + 1; +} + +void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, + int16_t *quant, int16_t *quant_shift, + int16_t *dequant, int16_t *round_fp, + int16_t *quant_fp) { + // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V. + const int max_qrounding_factor_fp = 64; + + for (int j = 0; j < 2; j++) { + // The range is 4 to 1828 in the VP9 tables. + const int qlookup = rnd->RandRange(1825) + 4; + round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7; + quant_fp[j] = (1 << 16) / qlookup; + + // Values determined by deconstructing vp9_init_quantizer(). + // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y + // values or U/V values of any bit depth. This is because y_delta is not + // factored into the vp9_ac_quant() call. + zbin[j] = rnd->RandRange(1200); + + // round may be up to 685 for Y values or 914 for U/V. + round[j] = rnd->RandRange(914); + // quant ranges from 1 to -32703 + quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703; + // quant_shift goes up to 1 << 16. + quant_shift[j] = rnd->RandRange(16384); + // dequant maxes out at 1828 for all cases. 
+ dequant[j] = rnd->RandRange(1828); + } + for (int j = 2; j < 8; j++) { + zbin[j] = zbin[1]; + round_fp[j] = round_fp[1]; + quant_fp[j] = quant_fp[1]; + round[j] = round[1]; + quant[j] = quant[1]; + quant_shift[j] = quant_shift[1]; + dequant[j] = dequant[1]; + } } -TEST_P(VP9Quantize32Test, OperationCheck) { +TEST_P(VP9QuantizeTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; + Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(qcoeff.Init()); + Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(dqcoeff.Init()); + Buffer<tran_low_t> ref_qcoeff = + Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer<tran_low_t> ref_dqcoeff = + Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t eob, ref_eob; + for (int i = 0; i < number_of_iterations; ++i) { - const int skip_block = i == 0; - const TX_SIZE sz = TX_32X32; - const TX_TYPE tx_type = (TX_TYPE)(i % 4); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); // 1024 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = rnd.Rand16() & mask_; - } - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); + // Test skip block for the first three iterations to catch all the different + // sizes. + const int skip_block = 0; + TX_SIZE sz; + if (max_size_ == 16) { + sz = static_cast<TX_SIZE>(i % 3); // TX_4X4, TX_8X8 TX_16X16 + } else { + sz = TX_32X32; } - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, + const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3); + const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; + const int count = (4 << sz) * (4 << sz); + coeff.Set(&rnd, -max_value_, max_value_); + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; + int16_t *q_ptr = (is_fp_) ? 
quant_fp_ptr_ : quant_ptr_; + ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, + q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_order->scan, scan_order->iscan); + ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); - } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; + coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, + quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), + dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); + + EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); + EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff)); + + EXPECT_EQ(eob, ref_eob); + + if (HasFailure()) { + printf("Failure on iteration %d.\n", i); + qcoeff.PrintDifference(ref_qcoeff); + dqcoeff.PrintDifference(ref_dqcoeff); + return; } - err_count_total += err_count; } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. " - << "First failed at test case " << first_failure; } TEST_P(VP9QuantizeTest, EOBCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; + Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(qcoeff.Init()); + Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(dqcoeff.Init()); + Buffer<tran_low_t> ref_qcoeff = + Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer<tran_low_t> ref_dqcoeff = + Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t eob, ref_eob; + for (int i = 0; i < number_of_iterations; ++i) { - int skip_block = i == 0; - TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16 - TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3); + const int skip_block = 0; + TX_SIZE sz; + if (max_size_ == 16) { + sz = static_cast<TX_SIZE>(i % 3); // TX_4X4, TX_8X8 TX_16X16 + } else { + sz = TX_32X32; + } + const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3); const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - int count = (4 << sz) * (4 << sz); // 16, 64, 256 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; + int count = (4 << sz) * (4 << sz); // Two random entries - for (int j = 0; j < count; j++) { - coeff_ptr[j] = 0; - } - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - for (int j = 0; j < 2; j++) { - zbin_ptr[j] 
= rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); - } - - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, + coeff.Set(0); + coeff.TopLeftPixel()[rnd(count)] = + static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_; + coeff.TopLeftPixel()[rnd(count)] = + static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_; + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; + int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; + ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, + q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_order->scan, scan_order->iscan); + ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); + coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, + quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), + dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); - } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; + EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); + EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff)); + + EXPECT_EQ(eob, ref_eob); + + if (HasFailure()) { + printf("Failure on iteration %d.\n", i); + qcoeff.PrintDifference(ref_qcoeff); + dqcoeff.PrintDifference(ref_dqcoeff); + return; } - err_count_total += err_count; } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. 
" - << "First failed at test case " << first_failure; } -TEST_P(VP9Quantize32Test, EOBCheck) { +TEST_P(VP9QuantizeTest, DISABLED_Speed) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; - for (int i = 0; i < number_of_iterations; ++i) { - int skip_block = i == 0; - TX_SIZE sz = TX_32X32; - TX_TYPE tx_type = (TX_TYPE)(i % 4); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - int count = (4 << sz) * (4 << sz); // 1024 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = 0; - } - // Two random entries - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); - } + Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(qcoeff.Init()); + Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(dqcoeff.Init()); + uint16_t eob; + TX_SIZE starting_sz, ending_sz; - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, - scan_order->scan, scan_order->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); + if (max_size_ == 16) { + starting_sz = TX_4X4; + ending_sz = TX_16X16; + } else { + starting_sz = TX_32X32; + ending_sz = TX_32X32; + } - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); - } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; + for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { + // zbin > coeff, zbin < coeff. + for (int i = 0; i < 2; ++i) { + const int skip_block = 0; + // TX_TYPE defines the scan order. That is not relevant to the speed test. + // Pick the first one. + const TX_TYPE tx_type = DCT_DCT; + const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; + const int count = (4 << sz) * (4 << sz); + + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; + int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; + + if (i == 0) { + // When |coeff values| are less than zbin the results are 0. + int threshold = 100; + if (max_size_ == 32) { + // For 32x32, the threshold is halved. 
Double it to keep the values + // from clearing it. + threshold = 200; + } + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; + coeff.Set(&rnd, -99, 99); + } else if (i == 1) { + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; + coeff.Set(&rnd, -500, 500); + } + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int j = 0; j < 100000000 / count; ++j) { + quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, + q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(), + dqcoeff.TopLeftPixel(), dequant_ptr_, &eob, + scan_order->scan, scan_order->iscan); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + if (i == 0) printf("Bypass calculations.\n"); + if (i == 1) printf("Full calculations.\n"); + printf("Quantize %dx%d time: %5d ms\n", 4 << sz, 4 << sz, + elapsed_time / 1000); } - err_count_total += err_count; + printf("\n"); } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. " - << "First failed at test case " << first_failure; } + using std::tr1::make_tuple; #if HAVE_SSE2 +#if CONFIG_VP9_HIGHBITDEPTH +// TODO(johannkoenig): Fix vpx_quantize_b_sse2 in highbitdepth builds. +// make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8), INSTANTIATE_TEST_CASE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_8), - make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_10), - make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_12))); + ::testing::Values( + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + +#else INSTANTIATE_TEST_CASE_P( - SSE2, VP9Quantize32Test, - ::testing::Values(make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12))); -#endif // HAVE_SSE2 + SSE2, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>, + &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, + 16, true))); #endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH +#if ARCH_X86_64 +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>, + &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, + 16, true))); +#else +INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false))); +#endif + +#if ARCH_X86_64 +// TODO(johannkoenig): SSSE3 optimizations do not yet pass this test. 
+INSTANTIATE_TEST_CASE_P( + DISABLED_SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_32x32_ssse3, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>, + &QuantFPWrapper<vp9_quantize_fp_32x32_c>, + VPX_BITS_8, 32, true))); +#endif // ARCH_X86_64 +#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH + +// TODO(johannkoenig): AVX optimizations do not yet pass the 32x32 test or +// highbitdepth configurations. +#if HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + // Even though SSSE3 and AVX do not match the reference + // code, we can keep them in sync with each other. + make_tuple(&vpx_quantize_b_32x32_avx, + &vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32, + false))); +#endif // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH + +// TODO(webm:1448): dqcoeff is not handled correctly in HBD builds. +#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + NEON, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_neon, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>, + &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>, + &QuantFPWrapper<vp9_quantize_fp_32x32_c>, + VPX_BITS_8, 32, true))); +#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH + +// Only useful to compare "Speed" test results. +INSTANTIATE_TEST_CASE_P( + DISABLED_C, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8, + 32, false), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>, + &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper<quantize_fp_nz_c>, + &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_c>, + &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32, + true))); } // namespace diff --git a/libvpx/test/vp9_scale_test.cc b/libvpx/test/vp9_scale_test.cc new file mode 100644 index 000000000..5d7d38e89 --- /dev/null +++ b/libvpx/test/vp9_scale_test.cc @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <stdio.h> +#include <string.h> + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/vpx_scale_test.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" +#include "vpx_scale/yv12config.h" + +namespace libvpx_test { + +typedef void (*ScaleFrameFunc)(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, int phase_scaler); + +class ScaleTest : public VpxScaleBase, + public ::testing::TestWithParam<ScaleFrameFunc> { + public: + virtual ~ScaleTest() {} + + protected: + virtual void SetUp() { scale_fn_ = GetParam(); } + + void ReferenceScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { + vp9_scale_and_extend_frame_c(&img_, &ref_img_, filter_type, phase_scaler); + } + + void ScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { + ASM_REGISTER_STATE_CHECK( + scale_fn_(&img_, &dst_img_, filter_type, phase_scaler)); + } + + void RunTest() { + static const int kNumSizesToTest = 20; + static const int kNumScaleFactorsToTest = 4; + static const int kSizesToTest[] = { + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, + 22, 24, 26, 28, 30, 32, 34, 68, 128, 134 + }; + static const int kScaleFactors[] = { 1, 2, 3, 4 }; + for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) { + for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { + for (int h = 0; h < kNumSizesToTest; ++h) { + const int src_height = kSizesToTest[h]; + for (int w = 0; w < kNumSizesToTest; ++w) { + const int src_width = kSizesToTest[w]; + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; + ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; + } + // I420 frame width and height must be even. + if (!dst_width || !dst_height || dst_width & 1 || + dst_height & 1) { + continue; + } + // vpx_convolve8_c() has restriction on the step which cannot + // exceed 64 (ratio 1 to 4). 
+ if (src_width > 4 * dst_width || src_height > 4 * dst_height) { + continue; + } + ASSERT_NO_FATAL_FAILURE(ResetScaleImages( + src_width, src_height, dst_width, dst_height)); + ReferenceScaleFrame(filter_type, phase_scaler); + ScaleFrame(filter_type, phase_scaler); + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d\n", + filter_type, phase_scaler, src_width, src_height, + dst_width, dst_height, sf_down, sf_up); + PrintDiff(); + } + CompareImages(dst_img_); + DeallocScaleImages(); + } + } + } + } + } + } + } + + void PrintDiffComponent(const uint8_t *const ref, const uint8_t *const opt, + const int stride, const int width, const int height, + const int plane_idx) const { + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + if (ref[y * stride + x] != opt[y * stride + x]) { + printf("Plane %d pixel[%d][%d] diff:%6d (ref),%6d (opt)\n", plane_idx, + y, x, ref[y * stride + x], opt[y * stride + x]); + break; + } + } + } + } + + void PrintDiff() const { + assert(ref_img_.y_stride == dst_img_.y_stride); + assert(ref_img_.y_width == dst_img_.y_width); + assert(ref_img_.y_height == dst_img_.y_height); + assert(ref_img_.uv_stride == dst_img_.uv_stride); + assert(ref_img_.uv_width == dst_img_.uv_width); + assert(ref_img_.uv_height == dst_img_.uv_height); + + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + PrintDiffComponent(ref_img_.y_buffer, dst_img_.y_buffer, + ref_img_.y_stride, ref_img_.y_width, ref_img_.y_height, + 0); + PrintDiffComponent(ref_img_.u_buffer, dst_img_.u_buffer, + ref_img_.uv_stride, ref_img_.uv_width, + ref_img_.uv_height, 1); + PrintDiffComponent(ref_img_.v_buffer, dst_img_.v_buffer, + ref_img_.uv_stride, ref_img_.uv_width, + ref_img_.uv_height, 2); + } + } + + ScaleFrameFunc scale_fn_; +}; + +TEST_P(ScaleTest, ScaleFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } + +TEST_P(ScaleTest, DISABLED_Speed) { + static const int kCountSpeedTestBlock = 100; + static const int kNumScaleFactorsToTest = 4; + static const int kScaleFactors[] = { 1, 2, 3, 4 }; + const int src_width = 1280; + const int src_height = 720; + for (INTERP_FILTER filter_type = 2; filter_type < 4; ++filter_type) { + for (int phase_scaler = 0; phase_scaler < 2; ++phase_scaler) { + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; + } + // I420 frame width and height must be even. 
+ if (dst_width & 1 || dst_height & 1) { + continue; + } + ASSERT_NO_FATAL_FAILURE( + ResetScaleImages(src_width, src_height, dst_width, dst_height)); + ASM_REGISTER_STATE_CHECK( + ReferenceScaleFrame(filter_type, phase_scaler)); + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + ScaleFrame(filter_type, phase_scaler); + } + libvpx_test::ClearSystemState(); + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000); + CompareImages(dst_img_); + DeallocScaleImages(); + + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d, scale time: %5d ms\n", + filter_type, phase_scaler, src_width, src_height, dst_width, + dst_height, sf_down, sf_up, elapsed_time); + } + } + } + } +} + +INSTANTIATE_TEST_CASE_P(C, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_c)); + +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P(SSSE3, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_ssse3)); +#endif // HAVE_SSSE3 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_neon)); +#endif // HAVE_NEON + +} // namespace libvpx_test diff --git a/libvpx/test/vp9_skip_loopfilter_test.cc b/libvpx/test/vp9_skip_loopfilter_test.cc index e847bbddf..d41a784a2 100644 --- a/libvpx/test/vp9_skip_loopfilter_test.cc +++ b/libvpx/test/vp9_skip_loopfilter_test.cc @@ -85,8 +85,8 @@ class SkipLoopFilterTest { // TODO(fgalligan): Move the MD5 testing code into another class. void OpenMd5File(const std::string &md5_file_name) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name); - ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: " - << md5_file_name; + ASSERT_TRUE(md5_file_ != NULL) + << "MD5 file open failed. Filename: " << md5_file_name; } // Reads the next line of the MD5 file. 
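Both of the new DISABLED_Speed tests above (in vp9_quantize_test.cc and vp9_scale_test.cc) rely on the same vpx_ports/vpx_timer.h pattern: start the timer, run the kernel in a tight loop, mark the timer, then read the elapsed microseconds. The following is a minimal sketch of that pattern only; kernel_under_test and the iteration count are illustrative placeholders, not part of libvpx or of these patches.

#include <cstdio>
#include "vpx_ports/vpx_timer.h"

// Hypothetical stand-in for the routine being measured, e.g. quantize_op_
// in vp9_quantize_test.cc or ScaleFrame() in vp9_scale_test.cc.
static void kernel_under_test() {}

static void TimeKernel(int iterations) {
  vpx_usec_timer timer;
  vpx_usec_timer_start(&timer);
  for (int i = 0; i < iterations; ++i) {
    kernel_under_test();
  }
  vpx_usec_timer_mark(&timer);  // Stop the clock before reading it.
  const int elapsed_us = static_cast<int>(vpx_usec_timer_elapsed(&timer));
  printf("%d iterations: %5d ms\n", iterations, elapsed_us / 1000);
}

Note the difference in how the two tests size their loops: the quantize speed test scales the trip count by transform size (100000000 / count) so each block size does a comparable amount of work per measurement, while the scale speed test runs a fixed kCountSpeedTestBlock of 100 frames per configuration.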
diff --git a/libvpx/test/vp9_subtract_test.cc b/libvpx/test/vp9_subtract_test.cc index 19ed30431..62845ad61 100644 --- a/libvpx/test/vp9_subtract_test.cc +++ b/libvpx/test/vp9_subtract_test.cc @@ -101,4 +101,9 @@ INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_msa)); #endif +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P(MMI, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_mmi)); +#endif + } // namespace vp9 diff --git a/libvpx/test/vp9_thread_test.cc b/libvpx/test/vp9_thread_test.cc index 3e3fd25ac..576f5e906 100644 --- a/libvpx/test/vp9_thread_test.cc +++ b/libvpx/test/vp9_thread_test.cc @@ -187,8 +187,8 @@ void DecodeFiles(const FileList files[]) { for (const FileList *iter = files; iter->name != NULL; ++iter) { SCOPED_TRACE(iter->name); for (int t = 1; t <= 8; ++t) { - EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t)) << "threads = " - << t; + EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t)) + << "threads = " << t; } } } diff --git a/libvpx/test/vpx_scale_test.cc b/libvpx/test/vpx_scale_test.cc index 9701d93da..ac75dceb2 100644 --- a/libvpx/test/vpx_scale_test.cc +++ b/libvpx/test/vpx_scale_test.cc @@ -14,149 +14,17 @@ #include "./vpx_scale_rtcd.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" +#include "test/vpx_scale_test.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" #include "vpx_scale/yv12config.h" -namespace { +namespace libvpx_test { typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf); typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf, YV12_BUFFER_CONFIG *dst_ybf); -class VpxScaleBase { - public: - virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); } - - void ResetImage(int width, int height) { - width_ = width; - height_ = height; - memset(&img_, 0, sizeof(img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_, - VP8BORDERINPIXELS)); - memset(img_.buffer_alloc, kBufFiller, img_.frame_size); - FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, - img_.y_stride); - FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, - img_.uv_stride); - FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, - img_.uv_stride); - - memset(&ref_img_, 0, sizeof(ref_img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_, - VP8BORDERINPIXELS)); - memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size); - - memset(&cpy_img_, 0, sizeof(cpy_img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_, - VP8BORDERINPIXELS)); - memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size); - ReferenceCopyFrame(); - } - - void DeallocImage() { - vp8_yv12_de_alloc_frame_buffer(&img_); - vp8_yv12_de_alloc_frame_buffer(&ref_img_); - vp8_yv12_de_alloc_frame_buffer(&cpy_img_); - } - - protected: - static const int kBufFiller = 123; - static const int kBufMax = kBufFiller - 1; - - static void FillPlane(uint8_t *buf, int width, int height, int stride) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - buf[x + (y * stride)] = (x + (width * y)) % kBufMax; - } - } - } - - static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, - int width, int height, int stride, int padding) { - // Copy the outermost visible pixel to a distance of at least 'padding.' - // The buffers are allocated such that there may be excess space outside the - // padding. 
As long as the minimum amount of padding is achieved it is not - // necessary to fill this space as well. - uint8_t *left = buf - padding; - uint8_t *right = buf + crop_width; - const int right_extend = padding + (width - crop_width); - const int bottom_extend = padding + (height - crop_height); - - // Fill the border pixels from the nearest image pixel. - for (int y = 0; y < crop_height; ++y) { - memset(left, left[padding], padding); - memset(right, right[-1], right_extend); - left += stride; - right += stride; - } - - left = buf - padding; - uint8_t *top = left - (stride * padding); - // The buffer does not always extend as far as the stride. - // Equivalent to padding + width + padding. - const int extend_width = padding + crop_width + right_extend; - - // The first row was already extended to the left and right. Copy it up. - for (int y = 0; y < padding; ++y) { - memcpy(top, left, extend_width); - top += stride; - } - - uint8_t *bottom = left + (crop_height * stride); - for (int y = 0; y < bottom_extend; ++y) { - memcpy(bottom, left + (crop_height - 1) * stride, extend_width); - bottom += stride; - } - } - - void ReferenceExtendBorder() { - ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width, - ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height, - ref_img_.y_stride, ref_img_.border); - ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width, - ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, - ref_img_.uv_stride, ref_img_.border / 2); - ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width, - ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, - ref_img_.uv_stride, ref_img_.border / 2); - } - - void ReferenceCopyFrame() { - // Copy img_ to ref_img_ and extend frame borders. This will be used for - // verifying extend_fn_ as well as copy_frame_fn_. 
- EXPECT_EQ(ref_img_.frame_size, img_.frame_size); - for (int y = 0; y < img_.y_crop_height; ++y) { - for (int x = 0; x < img_.y_crop_width; ++x) { - ref_img_.y_buffer[x + y * ref_img_.y_stride] = - img_.y_buffer[x + y * img_.y_stride]; - } - } - - for (int y = 0; y < img_.uv_crop_height; ++y) { - for (int x = 0; x < img_.uv_crop_width; ++x) { - ref_img_.u_buffer[x + y * ref_img_.uv_stride] = - img_.u_buffer[x + y * img_.uv_stride]; - ref_img_.v_buffer[x + y * ref_img_.uv_stride] = - img_.v_buffer[x + y * img_.uv_stride]; - } - } - - ReferenceExtendBorder(); - } - - void CompareImages(const YV12_BUFFER_CONFIG actual) { - EXPECT_EQ(ref_img_.frame_size, actual.frame_size); - EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc, - ref_img_.frame_size)); - } - - YV12_BUFFER_CONFIG img_; - YV12_BUFFER_CONFIG ref_img_; - YV12_BUFFER_CONFIG cpy_img_; - int width_; - int height_; -}; - class ExtendBorderTest : public VpxScaleBase, public ::testing::TestWithParam<ExtendFrameBorderFunc> { @@ -178,11 +46,11 @@ class ExtendBorderTest static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 }; for (int h = 0; h < kNumSizesToTest; ++h) { for (int w = 0; w < kNumSizesToTest; ++w) { - ASSERT_NO_FATAL_FAILURE(ResetImage(kSizesToTest[w], kSizesToTest[h])); + ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); + ReferenceCopyFrame(); ExtendBorder(); - ReferenceExtendBorder(); CompareImages(img_); - DeallocImage(); + DeallocImages(); } } } @@ -204,7 +72,7 @@ class CopyFrameTest : public VpxScaleBase, virtual void SetUp() { copy_frame_fn_ = GetParam(); } void CopyFrame() { - ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &cpy_img_)); + ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &dst_img_)); } void RunTest() { @@ -217,11 +85,11 @@ class CopyFrameTest : public VpxScaleBase, static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 }; for (int h = 0; h < kNumSizesToTest; ++h) { for (int w = 0; w < kNumSizesToTest; ++w) { - ASSERT_NO_FATAL_FAILURE(ResetImage(kSizesToTest[w], kSizesToTest[h])); + ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); ReferenceCopyFrame(); CopyFrame(); - CompareImages(cpy_img_); - DeallocImage(); + CompareImages(dst_img_); + DeallocImages(); } } } @@ -233,4 +101,5 @@ TEST_P(CopyFrameTest, CopyFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } INSTANTIATE_TEST_CASE_P(C, CopyFrameTest, ::testing::Values(vp8_yv12_copy_frame_c)); -} // namespace + +} // namespace libvpx_test diff --git a/libvpx/test/vpx_scale_test.h b/libvpx/test/vpx_scale_test.h new file mode 100644 index 000000000..dcbd02b91 --- /dev/null +++ b/libvpx/test/vpx_scale_test.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef TEST_VPX_SCALE_TEST_H_ +#define TEST_VPX_SCALE_TEST_H_ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12config.h" + +using libvpx_test::ACMRandom; + +namespace libvpx_test { + +class VpxScaleBase { + public: + virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); } + + void ResetImage(YV12_BUFFER_CONFIG *const img, const int width, + const int height) { + memset(img, 0, sizeof(*img)); + ASSERT_EQ( + 0, vp8_yv12_alloc_frame_buffer(img, width, height, VP8BORDERINPIXELS)); + memset(img->buffer_alloc, kBufFiller, img->frame_size); + } + + void ResetImages(const int width, const int height) { + ResetImage(&img_, width, height); + ResetImage(&ref_img_, width, height); + ResetImage(&dst_img_, width, height); + + FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + } + + void ResetScaleImage(YV12_BUFFER_CONFIG *const img, const int width, + const int height) { + memset(img, 0, sizeof(*img)); +#if CONFIG_VP9_HIGHBITDEPTH + ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1, 0, + VP9_ENC_BORDER_IN_PIXELS, 0)); +#else + ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1, + VP9_ENC_BORDER_IN_PIXELS, 0)); +#endif + memset(img->buffer_alloc, kBufFiller, img->frame_size); + } + + void ResetScaleImages(const int src_width, const int src_height, + const int dst_width, const int dst_height) { + ResetScaleImage(&img_, src_width, src_height); + ResetScaleImage(&ref_img_, dst_width, dst_height); + ResetScaleImage(&dst_img_, dst_width, dst_height); + FillPlaneExtreme(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlaneExtreme(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlaneExtreme(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + } + + void DeallocImages() { + vp8_yv12_de_alloc_frame_buffer(&img_); + vp8_yv12_de_alloc_frame_buffer(&ref_img_); + vp8_yv12_de_alloc_frame_buffer(&dst_img_); + } + + void DeallocScaleImages() { + vpx_free_frame_buffer(&img_); + vpx_free_frame_buffer(&ref_img_); + vpx_free_frame_buffer(&dst_img_); + } + + protected: + static const int kBufFiller = 123; + static const int kBufMax = kBufFiller - 1; + + static void FillPlane(uint8_t *const buf, const int width, const int height, + const int stride) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = (x + (width * y)) % kBufMax; + } + } + } + + static void FillPlaneExtreme(uint8_t *const buf, const int width, + const int height, const int stride) { + ACMRandom rnd; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = rnd.Rand8() % 2 ? 255 : 0; + } + } + } + + static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, + int width, int height, int stride, int padding) { + // Copy the outermost visible pixel to a distance of at least 'padding.' + // The buffers are allocated such that there may be excess space outside the + // padding. As long as the minimum amount of padding is achieved it is not + // necessary to fill this space as well. 
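+    // Worked example with hypothetical numbers (not taken from any caller):
+    // for crop_width = 5, width = 6, padding = 2 and stride = 12, a fully
+    // extended row spans extend_width = padding + crop_width + right_extend
+    // = 10 bytes: 2 left-pad bytes copied from the first visible pixel, the
+    // 5 visible pixels, then right_extend = padding + (width - crop_width)
+    // = 3 bytes copied from the last visible pixel. The remaining 2 bytes of
+    // the stride are the excess space that may legitimately stay unfilled.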
+ uint8_t *left = buf - padding; + uint8_t *right = buf + crop_width; + const int right_extend = padding + (width - crop_width); + const int bottom_extend = padding + (height - crop_height); + + // Fill the border pixels from the nearest image pixel. + for (int y = 0; y < crop_height; ++y) { + memset(left, left[padding], padding); + memset(right, right[-1], right_extend); + left += stride; + right += stride; + } + + left = buf - padding; + uint8_t *top = left - (stride * padding); + // The buffer does not always extend as far as the stride. + // Equivalent to padding + width + padding. + const int extend_width = padding + crop_width + right_extend; + + // The first row was already extended to the left and right. Copy it up. + for (int y = 0; y < padding; ++y) { + memcpy(top, left, extend_width); + top += stride; + } + + uint8_t *bottom = left + (crop_height * stride); + for (int y = 0; y < bottom_extend; ++y) { + memcpy(bottom, left + (crop_height - 1) * stride, extend_width); + bottom += stride; + } + } + + void ReferenceExtendBorder() { + ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width, + ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height, + ref_img_.y_stride, ref_img_.border); + ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width, + ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, ref_img_.border / 2); + ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width, + ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, ref_img_.border / 2); + } + + void ReferenceCopyFrame() { + // Copy img_ to ref_img_ and extend frame borders. This will be used for + // verifying extend_fn_ as well as copy_frame_fn_. + EXPECT_EQ(ref_img_.frame_size, img_.frame_size); + for (int y = 0; y < img_.y_crop_height; ++y) { + for (int x = 0; x < img_.y_crop_width; ++x) { + ref_img_.y_buffer[x + y * ref_img_.y_stride] = + img_.y_buffer[x + y * img_.y_stride]; + } + } + + for (int y = 0; y < img_.uv_crop_height; ++y) { + for (int x = 0; x < img_.uv_crop_width; ++x) { + ref_img_.u_buffer[x + y * ref_img_.uv_stride] = + img_.u_buffer[x + y * img_.uv_stride]; + ref_img_.v_buffer[x + y * ref_img_.uv_stride] = + img_.v_buffer[x + y * img_.uv_stride]; + } + } + + ReferenceExtendBorder(); + } + + void CompareImages(const YV12_BUFFER_CONFIG actual) { + EXPECT_EQ(ref_img_.frame_size, actual.frame_size); + EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc, + ref_img_.frame_size)); + } + + YV12_BUFFER_CONFIG img_; + YV12_BUFFER_CONFIG ref_img_; + YV12_BUFFER_CONFIG dst_img_; +}; + +} // namespace libvpx_test + +#endif // TEST_VPX_SCALE_TEST_H_ diff --git a/libvpx/test/vpx_temporal_svc_encoder.sh b/libvpx/test/vpx_temporal_svc_encoder.sh index 3d5152ae3..56a7902f4 100755 --- a/libvpx/test/vpx_temporal_svc_encoder.sh +++ b/libvpx/test/vpx_temporal_svc_encoder.sh @@ -52,11 +52,19 @@ vpx_tsvc_encoder() { # TODO(tomfinegan): Verify file output for all thread runs. 
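  # A sketch of the intent of the branch below, so far as this diff shows it:
  # plain builds invoke the example encoder with the original argument list,
  # while CONFIG_VP9_HIGHBITDEPTH builds append a single extra trailing
  # argument, "8", which presumably tells the high-bitdepth-capable binary to
  # treat the raw input as 8-bit.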
for threads in $(seq $max_threads); do - eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" \ - "${codec}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \ - "${timebase_num}" "${timebase_den}" "${speed}" "${frame_drop_thresh}" \ - "${error_resilient}" "${threads}" "$@" \ - ${devnull} + if [ "$(vpx_config_option_enabled CONFIG_VP9_HIGHBITDEPTH)" != "yes" ]; then + eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \ + "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" ${devnull} + else + eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \ + "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" "8" ${devnull} + fi done } diff --git a/libvpx/test/vpxenc.sh b/libvpx/test/vpxenc.sh index e8994992a..0c160dafc 100755 --- a/libvpx/test/vpxenc.sh +++ b/libvpx/test/vpxenc.sh @@ -90,6 +90,15 @@ vpxenc_rt_params() { --undershoot-pct=50" } +# Forces --passes to 1 with CONFIG_REALTIME_ONLY. +vpxenc_passes_param() { + if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" = "yes" ]; then + echo "--passes=1" + else + echo "--passes=2" + fi +} + # Wrapper function for running vpxenc with pipe input. Requires that # LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the # input file path and shifted away. All remaining parameters are passed through @@ -218,9 +227,11 @@ vpxenc_vp8_ivf_piped_input() { vpxenc_vp9_ivf() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ + "${passes}" \ --ivf \ --output="${output}" @@ -235,9 +246,11 @@ vpxenc_vp9_webm() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ + "${passes}" \ --output="${output}" if [ ! -e "${output}" ]; then @@ -339,11 +352,13 @@ vpxenc_vp9_webm_2pass() { vpxenc_vp9_ivf_lossless() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ --ivf \ --output="${output}" \ + "${passes}" \ --lossless=1 if [ ! 
-e "${output}" ]; then @@ -356,11 +371,13 @@ vpxenc_vp9_ivf_lossless() { vpxenc_vp9_ivf_minq0_maxq0() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ --ivf \ --output="${output}" \ + "${passes}" \ --min-q=0 \ --max-q=0 @@ -377,12 +394,13 @@ vpxenc_vp9_webm_lag10_frames20() { local readonly lag_total_frames=20 local readonly lag_frames=10 local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${lag_total_frames}" \ --lag-in-frames="${lag_frames}" \ --output="${output}" \ - --passes=2 \ + "${passes}" \ --auto-alt-ref=1 if [ ! -e "${output}" ]; then @@ -397,9 +415,11 @@ vpxenc_vp9_webm_non_square_par() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm" + local readonly passes=$(vpxenc_passes_param) vpxenc $(y4m_input_non_square_par) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ + "${passes}" \ --output="${output}" if [ ! -e "${output}" ]; then @@ -412,18 +432,21 @@ vpxenc_vp9_webm_non_square_par() { vpxenc_tests="vpxenc_vp8_ivf vpxenc_vp8_webm vpxenc_vp8_webm_rt - vpxenc_vp8_webm_2pass - vpxenc_vp8_webm_lag10_frames20 vpxenc_vp8_ivf_piped_input vpxenc_vp9_ivf vpxenc_vp9_webm vpxenc_vp9_webm_rt vpxenc_vp9_webm_rt_multithread_tiled vpxenc_vp9_webm_rt_multithread_tiled_frameparallel - vpxenc_vp9_webm_2pass vpxenc_vp9_ivf_lossless vpxenc_vp9_ivf_minq0_maxq0 vpxenc_vp9_webm_lag10_frames20 vpxenc_vp9_webm_non_square_par" +if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then + vpxenc_tests="$vpxenc_tests + vpxenc_vp8_webm_2pass + vpxenc_vp8_webm_lag10_frames20 + vpxenc_vp9_webm_2pass" +fi run_tests vpxenc_verify_environment "${vpxenc_tests}" diff --git a/libvpx/test/webm_video_source.h b/libvpx/test/webm_video_source.h index 53713618e..09c007a3f 100644 --- a/libvpx/test/webm_video_source.h +++ b/libvpx/test/webm_video_source.h @@ -40,8 +40,8 @@ class WebMVideoSource : public CompressedVideoSource { virtual void Begin() { vpx_ctx_->file = OpenTestDataFile(file_name_); - ASSERT_TRUE(vpx_ctx_->file != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(vpx_ctx_->file != NULL) + << "Input file open failed. Filename: " << file_name_; ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM"; diff --git a/libvpx/test/y4m_video_source.h b/libvpx/test/y4m_video_source.h index 2682ddde3..1301f6970 100644 --- a/libvpx/test/y4m_video_source.h +++ b/libvpx/test/y4m_video_source.h @@ -34,8 +34,8 @@ class Y4mVideoSource : public VideoSource { virtual void OpenSource() { CloseSource(); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(input_file_ != NULL) + << "Input file open failed. 
Filename: " << file_name_; } virtual void ReadSourceToStart() { diff --git a/libvpx/test/yuv_video_source.h b/libvpx/test/yuv_video_source.h index 71ad2ab9a..aee6b2ffb 100644 --- a/libvpx/test/yuv_video_source.h +++ b/libvpx/test/yuv_video_source.h @@ -43,8 +43,8 @@ class YUVVideoSource : public VideoSource { virtual void Begin() { if (input_file_) fclose(input_file_); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(input_file_ != NULL) + << "Input file open failed. Filename: " << file_name_; if (start_) { fseek(input_file_, static_cast<unsigned>(raw_size_) * start_, SEEK_SET); } diff --git a/libvpx/third_party/googletest/README.libvpx b/libvpx/third_party/googletest/README.libvpx index 3d9938096..2cd6910b4 100644 --- a/libvpx/third_party/googletest/README.libvpx +++ b/libvpx/third_party/googletest/README.libvpx @@ -20,3 +20,5 @@ Local Modifications: LICENSE README.md src +- Suppress unsigned overflow instrumentation in the LCG + https://github.com/google/googletest/pull/1066 diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h index 0094ed507..da57e65d3 100644 --- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h +++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -985,6 +985,19 @@ using ::std::tuple_size; # define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ #endif // __clang__ +// A function level attribute to disable UndefinedBehaviorSanitizer's (defined) +// unsigned integer overflow instrumentation. +#if defined(__clang__) +# if defined(__has_attribute) && __has_attribute(no_sanitize) +# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +# else +# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ +# endif // defined(__has_attribute) && __has_attribute(no_sanitize) +#else +# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ +#endif // __clang__ + namespace testing { class Message; diff --git a/libvpx/third_party/googletest/src/src/gtest.cc b/libvpx/third_party/googletest/src/src/gtest.cc index d882ab2e3..5a8932c73 100644 --- a/libvpx/third_party/googletest/src/src/gtest.cc +++ b/libvpx/third_party/googletest/src/src/gtest.cc @@ -308,6 +308,7 @@ namespace internal { // Generates a random number from [0, range), using a Linear // Congruential Generator (LCG). Crashes if 'range' is 0 or greater // than kMaxRange. +GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ UInt32 Random::Generate(UInt32 range) { // These constants are the same as are used in glibc's rand(3). 
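// A sketch of the arithmetic being suppressed, assuming kMaxRange = 1u << 31
// as in stock googletest: the update below computes
// state' = (a * state + c) mod 2^31 with glibc's a = 1103515245 and
// c = 12345, and the product a * state routinely wraps past UINT32_MAX.
// That wraparound is well-defined for unsigned types, but UBSan's optional
// unsigned-integer-overflow check still reports it, hence the
// GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ annotation above.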
state_ = (1103515245U*state_ + 12345U) % kMaxRange; diff --git a/libvpx/third_party/libwebm/README.libvpx b/libvpx/third_party/libwebm/README.libvpx index 1f8a13d78..ebb5ff2f4 100644 --- a/libvpx/third_party/libwebm/README.libvpx +++ b/libvpx/third_party/libwebm/README.libvpx @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: 9732ae991efb71aced4267d4794918279e362d99 +Version: 0ae757087f5e6eb01dfea16cc09205b2425cfb74 License: BSD License File: LICENSE.txt diff --git a/libvpx/third_party/libwebm/common/hdr_util.h b/libvpx/third_party/libwebm/common/hdr_util.h index 689fb30a3..3ef5388fd 100644 --- a/libvpx/third_party/libwebm/common/hdr_util.h +++ b/libvpx/third_party/libwebm/common/hdr_util.h @@ -47,7 +47,15 @@ struct Vp9CodecFeatures { int chroma_subsampling; }; +// disable deprecation warnings for auto_ptr +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif typedef std::auto_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr; +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic pop +#endif bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc, PrimaryChromaticityPtr* muxer_pc); diff --git a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc index 299b45c98..15b9a908d 100644 --- a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -24,6 +24,11 @@ #include "mkvmuxer/mkvwriter.h" #include "mkvparser/mkvparser.h" +// disable deprecation warnings for auto_ptr +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + namespace mkvmuxer { const float PrimaryChromaticity::kChromaticityMin = 0.0f; @@ -3053,7 +3058,7 @@ Segment::Segment() output_cues_(true), accurate_cluster_duration_(false), fixed_size_cluster_timecode_(false), - estimate_file_duration_(true), + estimate_file_duration_(false), payload_pos_(0), size_position_(0), doc_type_version_(kDefaultDocTypeVersion), @@ -3361,7 +3366,10 @@ uint64_t Segment::AddVideoTrack(int32_t width, int32_t height, int32_t number) { track->set_width(width); track->set_height(height); - tracks_.AddTrack(track, number); + if (!tracks_.AddTrack(track, number)) { + delete track; + return 0; + } has_video_ = true; return track->number(); @@ -3383,8 +3391,10 @@ bool Segment::AddCuePoint(uint64_t timestamp, uint64_t track) { cue->set_block_number(cluster->blocks_added()); cue->set_cluster_pos(cluster->position_for_cues()); cue->set_track(track); - if (!cues_.AddCue(cue)) + if (!cues_.AddCue(cue)) { + delete cue; return false; + } new_cuepoint_ = false; return true; @@ -3401,7 +3411,10 @@ uint64_t Segment::AddAudioTrack(int32_t sample_rate, int32_t channels, track->set_sample_rate(sample_rate); track->set_channels(channels); - tracks_.AddTrack(track, number); + if (!tracks_.AddTrack(track, number)) { + delete track; + return 0; + } return track->number(); } @@ -3490,16 +3503,33 @@ bool Segment::AddGenericFrame(const Frame* frame) { if (frame->discard_padding() != 0) doc_type_version_ = 4; + if (cluster_list_size_ > 0) { + const uint64_t timecode_scale = segment_info_.timecode_scale(); + const uint64_t frame_timecode = frame->timestamp() / timecode_scale; + + const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1]; + const uint64_t last_cluster_timecode = last_cluster->timecode(); + + const uint64_t rel_timecode = frame_timecode - 
last_cluster_timecode; + if (rel_timecode > kMaxBlockTimecode) { + force_new_cluster_ = true; + } + } + // If the segment has a video track hold onto audio frames to make sure the // audio that is associated with the start time of a video key-frame is // muxed into the same cluster. if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) && !force_new_cluster_) { Frame* const new_frame = new (std::nothrow) Frame(); - if (!new_frame || !new_frame->CopyFrom(*frame)) + if (!new_frame || !new_frame->CopyFrom(*frame)) { + delete new_frame; return false; - if (!QueueFrame(new_frame)) + } + if (!QueueFrame(new_frame)) { + delete new_frame; return false; + } track_frames_written_[frame->track_number() - 1]++; return true; } @@ -3522,8 +3552,10 @@ bool Segment::AddGenericFrame(const Frame* frame) { if (!frame->CanBeSimpleBlock() && !frame->is_key() && !frame->reference_block_timestamp_set()) { Frame* const new_frame = new (std::nothrow) Frame(); - if (!new_frame->CopyFrom(*frame)) + if (!new_frame || !new_frame->CopyFrom(*frame)) { + delete new_frame; return false; + } new_frame->set_reference_block_timestamp( last_track_timestamp_[frame->track_number() - 1]); frame = new_frame; diff --git a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index 1ba17ac1b..355d4e22b 100644 --- a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -10,6 +10,7 @@ #ifdef __ANDROID__ #include <fcntl.h> +#include <unistd.h> #endif #include <cassert> @@ -288,7 +289,7 @@ uint64 EbmlElementSize(uint64 type, const char* value) { ebml_size += strlen(value); // Size of Datasize - ebml_size++; + ebml_size += GetCodedUIntSize(strlen(value)); return ebml_size; } diff --git a/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc b/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc index ec34e4df8..84655d802 100644 --- a/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc +++ b/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc @@ -8,6 +8,8 @@ #include "mkvmuxer/mkvwriter.h" +#include <sys/types.h> + #ifdef _MSC_VER #include <share.h> // for _SH_DENYWR #endif diff --git a/libvpx/third_party/libwebm/mkvparser/mkvparser.cc b/libvpx/third_party/libwebm/mkvparser/mkvparser.cc index e62d6f607..37f230d0a 100644 --- a/libvpx/third_party/libwebm/mkvparser/mkvparser.cc +++ b/libvpx/third_party/libwebm/mkvparser/mkvparser.cc @@ -22,6 +22,11 @@ #include "common/webmids.h" +// disable deprecation warnings for auto_ptr +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + namespace mkvparser { const float MasteringMetadata::kValueNotPresent = FLT_MAX; const long long Colour::kValueNotPresent = LLONG_MAX; @@ -1528,15 +1533,19 @@ long SeekHead::Parse() { if (pos != stop) return E_FILE_FORMAT_INVALID; - m_entries = new (std::nothrow) Entry[entry_count]; + if (entry_count > 0) { + m_entries = new (std::nothrow) Entry[entry_count]; - if (m_entries == NULL) - return -1; + if (m_entries == NULL) + return -1; + } - m_void_elements = new (std::nothrow) VoidElement[void_element_count]; + if (void_element_count > 0) { + m_void_elements = new (std::nothrow) VoidElement[void_element_count]; - if (m_void_elements == NULL) - return -1; + if (m_void_elements == NULL) + return -1; + } // now parse the entries and void elements @@ -1555,14 +1564,14 @@ long SeekHead::Parse() { if (status < 0) // error return status; - if (id == libwebm::kMkvSeek) { + if (id == libwebm::kMkvSeek && entry_count > 0) 
{ if (ParseEntry(pReader, pos, size, pEntry)) { Entry& e = *pEntry++; e.element_start = idpos; e.element_size = (pos + size) - idpos; } - } else if (id == libwebm::kMkvVoid) { + } else if (id == libwebm::kMkvVoid && void_element_count > 0) { VoidElement& e = *pVoidElement++; e.element_start = idpos; @@ -2426,7 +2435,9 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_, } const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const { - assert(pTrack); + if (pTrack == NULL) { + return NULL; + } const long long n = pTrack->GetNumber(); @@ -4026,7 +4037,7 @@ long SegmentInfo::Parse() { } const double rollover_check = m_duration * m_timecodeScale; - if (rollover_check > LLONG_MAX) + if (rollover_check > static_cast<double>(LLONG_MAX)) return E_FILE_FORMAT_INVALID; if (pos != stop) @@ -4975,29 +4986,27 @@ bool PrimaryChromaticity::Parse(IMkvReader* reader, long long read_pos, if (!reader) return false; - std::auto_ptr<PrimaryChromaticity> chromaticity_ptr; - - if (!*chromaticity) { - chromaticity_ptr.reset(new PrimaryChromaticity()); - } else { - chromaticity_ptr.reset(*chromaticity); - } + if (!*chromaticity) + *chromaticity = new PrimaryChromaticity(); - if (!chromaticity_ptr.get()) + if (!*chromaticity) return false; - float* value = is_x ? &chromaticity_ptr->x : &chromaticity_ptr->y; + PrimaryChromaticity* pc = *chromaticity; + float* value = is_x ? &pc->x : &pc->y; double parser_value = 0; - const long long value_parse_status = + const long long parse_status = UnserializeFloat(reader, read_pos, value_size, parser_value); - *value = static_cast<float>(parser_value); - - if (value_parse_status < 0 || *value < 0.0 || *value > 1.0) + // Valid range is [0, 1]. Make sure the double is representable as a float + // before casting. + if (parse_status < 0 || parser_value < 0.0 || parser_value > 1.0 || + (parser_value > 0.0 && parser_value < FLT_MIN)) return false; - *chromaticity = chromaticity_ptr.release(); + *value = static_cast<float>(parser_value); + return true; } @@ -5228,7 +5237,9 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size, double value = 0; const long long value_parse_status = UnserializeFloat(reader, read_pos, child_size, value); - if (value_parse_status < 0) { + // Make sure value is representable as a float before casting. + if (value_parse_status < 0 || value < -FLT_MAX || value > FLT_MAX || + (value > 0.0 && value < FLT_MIN)) { return false; } @@ -7932,7 +7943,6 @@ long Block::Parse(const Cluster* pCluster) { pf = m_frames; while (pf != pf_end) { Frame& f = *pf++; - assert((pos + f.len) <= stop); if ((pos + f.len) > stop) return E_FILE_FORMAT_INVALID; diff --git a/libvpx/third_party/libwebm/mkvparser/mkvreader.cc b/libvpx/third_party/libwebm/mkvparser/mkvreader.cc index b8fd00c26..23d68f508 100644 --- a/libvpx/third_party/libwebm/mkvparser/mkvreader.cc +++ b/libvpx/third_party/libwebm/mkvparser/mkvreader.cc @@ -7,6 +7,8 @@ // be found in the AUTHORS file in the root of the source tree. 
#include "mkvparser/mkvreader.h" +#include <sys/types.h> + #include <cassert> namespace mkvparser { diff --git a/libvpx/tools.mk b/libvpx/tools.mk index 23adcee6e..1d005b2ac 100644 --- a/libvpx/tools.mk +++ b/libvpx/tools.mk @@ -13,6 +13,8 @@ TOOLS-yes += tiny_ssim.c tiny_ssim.SRCS += vpx/vpx_integer.h y4minput.c y4minput.h \ vpx/vpx_codec.h vpx/src/vpx_image.c tiny_ssim.SRCS += vpx_mem/vpx_mem.c vpx_mem/vpx_mem.h +tiny_ssim.SRCS += vpx_dsp/ssim.h vpx_scale/yv12config.h +tiny_ssim.SRCS += vpx_ports/mem.h vpx_ports/mem.h tiny_ssim.SRCS += vpx_mem/include/vpx_mem_intrnl.h tiny_ssim.GUID = 3afa9b05-940b-4d68-b5aa-55157d8ed7b4 tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files diff --git a/libvpx/tools/all_builds.py b/libvpx/tools/all_builds.py deleted file mode 100755 index d1f0c80c0..000000000 --- a/libvpx/tools/all_builds.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/python - -import getopt -import subprocess -import sys - -LONG_OPTIONS = ["shard=", "shards="] -BASE_COMMAND = "./configure --enable-internal-stats --enable-experimental" - -def RunCommand(command): - run = subprocess.Popen(command, shell=True) - output = run.communicate() - if run.returncode: - print "Non-zero return code: " + str(run.returncode) + " => exiting!" - sys.exit(1) - -def list_of_experiments(): - experiments = [] - configure_file = open("configure") - list_start = False - for line in configure_file.read().split("\n"): - if line == 'EXPERIMENT_LIST="': - list_start = True - elif line == '"': - list_start = False - elif list_start: - currently_broken = ["csm"] - experiment = line[4:] - if experiment not in currently_broken: - experiments.append(experiment) - return experiments - -def main(argv): - # Parse arguments - options = {"--shard": 0, "--shards": 1} - if "--" in argv: - opt_end_index = argv.index("--") - else: - opt_end_index = len(argv) - try: - o, _ = getopt.getopt(argv[1:opt_end_index], None, LONG_OPTIONS) - except getopt.GetoptError, err: - print str(err) - print "Usage: %s [--shard=<n> --shards=<n>] -- [configure flag ...]"%argv[0] - sys.exit(2) - - options.update(o) - extra_args = argv[opt_end_index + 1:] - - # Shard experiment list - shard = int(options["--shard"]) - shards = int(options["--shards"]) - experiments = list_of_experiments() - base_command = " ".join([BASE_COMMAND] + extra_args) - configs = [base_command] - configs += ["%s --enable-%s" % (base_command, e) for e in experiments] - my_configs = zip(configs, range(len(configs))) - my_configs = filter(lambda x: x[1] % shards == shard, my_configs) - my_configs = [e[0] for e in my_configs] - - # Run configs for this shard - for config in my_configs: - test_build(config) - -def test_build(configure_command): - print "\033[34m\033[47mTesting %s\033[0m" % (configure_command) - RunCommand(configure_command) - RunCommand("make clean") - RunCommand("make") - -if __name__ == "__main__": - main(sys.argv) diff --git a/libvpx/tools/author_first_release.sh b/libvpx/tools/author_first_release.sh deleted file mode 100755 index 7b0b79721..000000000 --- a/libvpx/tools/author_first_release.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -## -## List the release each author first contributed to. -## -## Usage: author_first_release.sh [TAGS] -## -## If the TAGS arguments are unspecified, all tags reported by `git tag` -## will be considered. 
-## -tags=${@:-$(git tag)} -for tag in $tags; do - git shortlog -n -e -s $tag | - cut -f2- | - awk "{print \"${tag#v}\t\"\$0}" -done | sort -k2 | uniq -f2 diff --git a/libvpx/tools/ftfy.sh b/libvpx/tools/ftfy.sh deleted file mode 100755 index c005918fe..000000000 --- a/libvpx/tools/ftfy.sh +++ /dev/null @@ -1,158 +0,0 @@ -#!/bin/sh -self="$0" -dirname_self=$(dirname "$self") - -usage() { - cat <<EOF >&2 -Usage: $self [option] - -This script applies a whitespace transformation to the commit at HEAD. If no -options are given, then the modified files are left in the working tree. - -Options: - -h, --help Shows this message - -n, --dry-run Shows a diff of the changes to be made. - --amend Squashes the changes into the commit at HEAD - This option will also reformat the commit message. - --commit Creates a new commit containing only the whitespace changes - --msg-only Reformat the commit message only, ignore the patch itself. - -EOF - rm -f ${CLEAN_FILES} - exit 1 -} - - -log() { - echo "${self##*/}: $@" >&2 -} - - -vpx_style() { - for f; do - case "$f" in - *.h|*.c|*.cc) - clang-format -i --style=file "$f" - ;; - esac - done -} - - -apply() { - [ $INTERSECT_RESULT -ne 0 ] && patch -p1 < "$1" -} - - -commit() { - LAST_CHANGEID=$(git show | awk '/Change-Id:/{print $2}') - if [ -z "$LAST_CHANGEID" ]; then - log "HEAD doesn't have a Change-Id, unable to generate a new commit" - exit 1 - fi - - # Build a deterministic Change-Id from the parent's - NEW_CHANGEID=${LAST_CHANGEID}-styled - NEW_CHANGEID=I$(echo $NEW_CHANGEID | git hash-object --stdin) - - # Commit, preserving authorship from the parent commit. - git commit -a -C HEAD > /dev/null - git commit --amend -F- << EOF -Cosmetic: Fix whitespace in change ${LAST_CHANGEID:0:9} - -Change-Id: ${NEW_CHANGEID} -EOF -} - - -show_commit_msg_diff() { - if [ $DIFF_MSG_RESULT -ne 0 ]; then - log "Modified commit message:" - diff -u "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" | tail -n +3 - fi -} - - -amend() { - show_commit_msg_diff - if [ $DIFF_MSG_RESULT -ne 0 ] || [ $INTERSECT_RESULT -ne 0 ]; then - git commit -a --amend -F "$NEW_COMMIT_MSG" - fi -} - - -diff_msg() { - git log -1 --format=%B > "$ORIG_COMMIT_MSG" - "${dirname_self}"/wrap-commit-msg.py \ - < "$ORIG_COMMIT_MSG" > "$NEW_COMMIT_MSG" - cmp -s "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" - DIFF_MSG_RESULT=$? -} - - -# Temporary files -ORIG_DIFF=orig.diff.$$ -MODIFIED_DIFF=modified.diff.$$ -FINAL_DIFF=final.diff.$$ -ORIG_COMMIT_MSG=orig.commit-msg.$$ -NEW_COMMIT_MSG=new.commit-msg.$$ -CLEAN_FILES="${ORIG_DIFF} ${MODIFIED_DIFF} ${FINAL_DIFF}" -CLEAN_FILES="${CLEAN_FILES} ${ORIG_COMMIT_MSG} ${NEW_COMMIT_MSG}" - -# Preconditions -[ $# -lt 2 ] || usage - -if ! clang-format -version >/dev/null 2>&1; then - log "clang-format not found" - exit 1 -fi - -if ! git diff --quiet HEAD; then - log "Working tree is dirty, commit your changes first" - exit 1 -fi - -# Need to be in the root -cd "$(git rev-parse --show-toplevel)" - -# Collect the original diff -git show > "${ORIG_DIFF}" - -# Apply the style guide on new and modified files and collect its diff -for f in $(git diff HEAD^ --name-only -M90 --diff-filter=AM); do - case "$f" in - third_party/*) continue;; - esac - vpx_style "$f" -done -git diff --no-color --no-ext-diff > "${MODIFIED_DIFF}" - -# Intersect the two diffs -"${dirname_self}"/intersect-diffs.py \ - "${ORIG_DIFF}" "${MODIFIED_DIFF}" > "${FINAL_DIFF}" -INTERSECT_RESULT=$? 
-git reset --hard >/dev/null - -# Fixup the commit message -diff_msg - -# Handle options -if [ -n "$1" ]; then - case "$1" in - -h|--help) usage;; - -n|--dry-run) cat "${FINAL_DIFF}"; show_commit_msg_diff;; - --commit) apply "${FINAL_DIFF}"; commit;; - --amend) apply "${FINAL_DIFF}"; amend;; - --msg-only) amend;; - *) usage;; - esac -else - apply "${FINAL_DIFF}" - if ! git diff --quiet; then - log "Formatting changes applied, verify and commit." - log "See also: http://www.webmproject.org/code/contribute/conventions/" - git diff --stat - fi -fi - -rm -f ${CLEAN_FILES} diff --git a/libvpx/tools/tiny_ssim.c b/libvpx/tools/tiny_ssim.c index 1f6a448bc..5e8ca02b4 100644 --- a/libvpx/tools/tiny_ssim.c +++ b/libvpx/tools/tiny_ssim.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> #include <errno.h> #include <math.h> #include <stdio.h> @@ -16,73 +17,36 @@ #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "./y4minput.h" +#include "vpx_dsp/ssim.h" +#include "vpx_ports/mem.h" + +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 +static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2 +static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2 +static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 + +#if CONFIG_VP9_HIGHBITDEPTH +static uint64_t calc_plane_error16(uint16_t *orig, int orig_stride, + uint16_t *recon, int recon_stride, + unsigned int cols, unsigned int rows) { + unsigned int row, col; + uint64_t total_sse = 0; + int diff; -void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, - uint32_t *sum_sq_r, uint32_t *sum_sxr) { - int i, j; - for (i = 0; i < 8; i++, s += sp, r += rp) { - for (j = 0; j < 8; j++) { - *sum_s += s[j]; - *sum_r += r[j]; - *sum_sq_s += s[j] * s[j]; - *sum_sq_r += r[j] * r[j]; - *sum_sxr += s[j] * r[j]; + for (row = 0; row < rows; row++) { + for (col = 0; col < cols; col++) { + diff = orig[col] - recon[col]; + total_sse += diff * diff; } - } -} - -static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 -static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 -static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, - uint32_t sum_sq_r, uint32_t sum_sxr, int count) { - int64_t ssim_n, ssim_d; - int64_t c1, c2; - - // scale the constants by number of pixels - c1 = (cc1 * count * count) >> 12; - c2 = (cc2 * count * count) >> 12; - - ssim_n = (2 * sum_s * sum_r + c1) * - ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); - - ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * - ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + - (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); - - return ssim_n * 1.0 / ssim_d; -} - -static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) { - uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; - vp8_ssim_parms_8x8_c(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, - &sum_sxr); - return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64); -} - -// We are using a 8x8 moving window with starting location of each 8x8 window -// on the 4x4 pixel grid. Such arrangement allows the windows to overlap -// block boundaries to penalize blocking artifacts. 
-double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1, - int stride_img2, int width, int height) { - int i, j; - int samples = 0; - double ssim_total = 0; - - // sample point start with each 4x4 location - for (i = 0; i <= height - 8; - i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { - for (j = 0; j <= width - 8; j += 4) { - double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); - ssim_total += v; - samples++; - } + orig += orig_stride; + recon += recon_stride; } - ssim_total /= samples; - return ssim_total; + return total_sse; } - +#endif static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon, int recon_stride, unsigned int cols, unsigned int rows) { @@ -103,7 +67,7 @@ static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon, } #define MAX_PSNR 100 -double vp9_mse2psnr(double samples, double peak, double mse) { +static double mse2psnr(double samples, double peak, double mse) { double psnr; if (mse > 0.0) @@ -126,10 +90,12 @@ typedef struct input_file { vpx_image_t img; int w; int h; + int bit_depth; } input_file_t; // Open a file and determine if its y4m or raw. If y4m get the header. -int open_input_file(const char *file_name, input_file_t *input, int w, int h) { +static int open_input_file(const char *file_name, input_file_t *input, int w, + int h, int bit_depth) { char y4m_buf[4]; size_t r1; input->type = RAW_YUV; @@ -144,6 +110,7 @@ int open_input_file(const char *file_name, input_file_t *input, int w, int h) { y4m_input_open(&input->y4m, input->file, y4m_buf, 4, 0); input->w = input->y4m.pic_w; input->h = input->y4m.pic_h; + input->bit_depth = input->y4m.bit_depth; // Y4M alloc's its own buf. Init this to avoid problems if we never // read frames. memset(&input->img, 0, sizeof(input->img)); @@ -152,14 +119,17 @@ int open_input_file(const char *file_name, input_file_t *input, int w, int h) { fseek(input->file, 0, SEEK_SET); input->w = w; input->h = h; - input->buf = malloc(w * h * 3 / 2); + if (bit_depth < 9) + input->buf = malloc(w * h * 3 / 2); + else + input->buf = malloc(w * h * 3); break; } } return 0; } -void close_input_file(input_file_t *in) { +static void close_input_file(input_file_t *in) { if (in->file) fclose(in->file); if (in->type == Y4M) { vpx_img_free(&in->img); @@ -168,8 +138,8 @@ void close_input_file(input_file_t *in) { } } -size_t read_input_file(input_file_t *in, unsigned char **y, unsigned char **u, - unsigned char **v) { +static size_t read_input_file(input_file_t *in, unsigned char **y, + unsigned char **u, unsigned char **v, int bd) { size_t r1 = 0; switch (in->type) { case Y4M: @@ -179,18 +149,429 @@ size_t read_input_file(input_file_t *in, unsigned char **y, unsigned char **u, *v = in->img.planes[2]; break; case RAW_YUV: - r1 = fread(in->buf, in->w * in->h * 3 / 2, 1, in->file); - *y = in->buf; - *u = in->buf + in->w * in->h; - *v = in->buf + 5 * in->w * in->h / 4; + if (bd < 9) { + r1 = fread(in->buf, in->w * in->h * 3 / 2, 1, in->file); + *y = in->buf; + *u = in->buf + in->w * in->h; + *v = in->buf + 5 * in->w * in->h / 4; + } else { + r1 = fread(in->buf, in->w * in->h * 3, 1, in->file); + *y = in->buf; + *u = in->buf + in->w * in->h / 2; + *v = *u + in->w * in->h / 2; + } break; } return r1; } +void ssim_parms_16x16(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 16; i++, s += sp, r += rp) { + for (j = 0; j < 16; j++) { + *sum_s += 
s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +void ssim_parms_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +void highbd_ssim_parms_8x8(const uint16_t *s, int sp, const uint16_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, + uint32_t sum_sq_r, uint32_t sum_sxr, int count, + uint32_t bd) { + int64_t ssim_n, ssim_d; + int64_t c1 = 0, c2 = 0; + if (bd == 8) { + // scale the constants by number of pixels + c1 = (cc1 * count * count) >> 12; + c2 = (cc2 * count * count) >> 12; + } else if (bd == 10) { + c1 = (cc1_10 * count * count) >> 12; + c2 = (cc2_10 * count * count) >> 12; + } else if (bd == 12) { + c1 = (cc1_12 * count * count) >> 12; + c2 = (cc2_12 * count * count) >> 12; + } else { + assert(0); + } + + ssim_n = (2 * sum_s * sum_r + c1) * + ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + + ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * + ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + + (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); + + return ssim_n * 1.0 / ssim_d; +} + +static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); +} + +static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t bd, uint32_t shift) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), + sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); +} + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. 
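+// Illustration with a hypothetical frame size: on a 16x16 plane the loops
+// below place windows at offsets 0, 4 and 8 in each direction, i.e.
+// 3 * 3 = 9 overlapping 8x8 samples, so the 4x4 sampling grid guarantees
+// that block boundaries always land in the interior of some window.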
+static double ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, + int stride_img2, int width, int height) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height, uint32_t bd, uint32_t shift) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, + shift); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity +// +// Re working out the math -> +// +// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) / +// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2)) +// +// mean(x) = sum(x) / n +// +// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n) +// +// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n) +// +// ssim(x,y) = +// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) * +// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+ +// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))) +// +// factoring out n*n +// +// ssim(x,y) = +// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) * +// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2)) +// +// Replace c1 with n*n * c1 for the final step that leads to this code: +// The final step scales by 12 bits so we don't lose precision in the constants. + +static double ssimv_similarity(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. + const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) / + (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); + + // Since these variables are unsigned sums, convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} + +// The first term of the ssim metric is a luminance factor. +// +// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1) +// +// This luminance factor is super sensitive to the dark side of luminance +// values and completely insensitive on the white side. check out 2 sets +// (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60 +// 2*250*252/ (250^2+252^2) => .99999997 +// +// As a result in this tweaked version of the calculation in which the +// luminance is taken as percentage off from peak possible. +// +// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count +// +static double ssimv_similarity2(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. 
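+  // (Extending the (1, 3) vs (250, 252) example above: both pairs have a
+  // mean difference of 2, so this tweaked term evaluates to roughly
+  // (255 * 255 - 4) / (255 * 255) ~= 0.9999 in both cases, removing the
+  // classic term's spread between 0.60 and 0.99999997.)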
+ const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n; + const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1); + + // Since these variables are unsigned, sums convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} +static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, Ssimv *sv) { + ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, + &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); +} + +double get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency) { + double dssim_total = 0; + double ssim_total = 0; + double ssim2_total = 0; + double inconsistency_total = 0; + int i, j; + int c = 0; + double norm; + double old_ssim_total = 0; + + // We can sample points as frequently as we like start with 1 per 4x4. + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (j = 0; j < width; j += 4, ++c) { + Ssimv sv = { 0, 0, 0, 0, 0, 0 }; + double ssim; + double ssim2; + double dssim; + uint32_t var_new; + uint32_t var_old; + uint32_t mean_new; + uint32_t mean_old; + double ssim_new; + double ssim_old; + + // Not sure there's a great way to handle the edge pixels + // in ssim when using a window. Seems biased against edge pixels + // however you handle this. This uses only samples that are + // fully in the frame. + if (j + 8 <= width && i + 8 <= height) { + ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv); + } + + ssim = ssimv_similarity(&sv, 64); + ssim2 = ssimv_similarity2(&sv, 64); + + sv.ssim = ssim2; + + // dssim is calculated to use as an actual error metric and + // is scaled up to the same range as sum square error. + // Since we are subsampling every 16th point maybe this should be + // *16 ? + dssim = 255 * 255 * (1 - ssim2) / 2; + + // Here I introduce a new error metric: consistency-weighted + // SSIM-inconsistency. This metric isolates frames where the + // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much + // sharper or blurrier than the others. Higher values indicate a + // temporally inconsistent SSIM. There are two ideas at work: + // + // 1) 'SSIM-inconsistency': the total inconsistency value + // reflects how much SSIM values are changing between this + // source / reference frame pair and the previous pair. + // + // 2) 'consistency-weighted': weights de-emphasize areas in the + // frame where the scene content has changed. Changes in scene + // content are detected via changes in local variance and local + // mean. + // + // Thus the overall measure reflects how inconsistent the SSIM + // values are, over consistent regions of the frame. + // + // The metric has three terms: + // + // term 1 -> uses change in scene Variance to weight error score + // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. + // + // term 2 -> uses change in local scene luminance to weight error + // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. 
+ // + // term3 -> measures inconsistency in ssim scores between frames + // 1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2). + // + // This term compares the ssim score for the same location in 2 + // subsequent frames. + var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64; + var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64; + mean_new = sv.sum_s; + mean_old = sv2[c].sum_s; + ssim_new = sv.ssim; + ssim_old = sv2[c].ssim; + + if (do_inconsistency) { + // We do the metric once for every 4x4 block in the image. Since + // we are scaling the error to SSE for use in a psnr calculation + // 1.0 = 4x4x255x255 the worst error we can possibly have. + static const double kScaling = 4. * 4 * 255 * 255; + + // The constants have to be non 0 to avoid potential divide by 0 + // issues other than that they affect kind of a weighting between + // the terms. No testing of what the right terms should be has been + // done. + static const double c1 = 1, c2 = 1, c3 = 1; + + // This measures how much consistent variance is in two consecutive + // source frames. 1.0 means they have exactly the same variance. + const double variance_term = + (2.0 * var_old * var_new + c1) / + (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1); + + // This measures how consistent the local mean are between two + // consecutive frames. 1.0 means they have exactly the same mean. + const double mean_term = + (2.0 * mean_old * mean_new + c2) / + (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2); + + // This measures how consistent the ssims of two + // consecutive frames is. 1.0 means they are exactly the same. + double ssim_term = + pow((2.0 * ssim_old * ssim_new + c3) / + (ssim_old * ssim_old + ssim_new * ssim_new + c3), + 5); + + double this_inconsistency; + + // Floating point math sometimes makes this > 1 by a tiny bit. + // We want the metric to scale between 0 and 1.0 so we can convert + // it to an snr scaled value. + if (ssim_term > 1) ssim_term = 1; + + // This converts the consistency metric to an inconsistency metric + // ( so we can scale it like psnr to something like sum square error. + // The reason for the variance and mean terms is the assumption that + // if there are big changes in the source we shouldn't penalize + // inconsistency in ssim scores a bit less as it will be less visible + // to the user. + this_inconsistency = (1 - ssim_term) * variance_term * mean_term; + + this_inconsistency *= kScaling; + inconsistency_total += this_inconsistency; + } + sv2[c] = sv; + ssim_total += ssim; + ssim2_total += ssim2; + dssim_total += dssim; + + old_ssim_total += ssim_old; + } + old_ssim_total += 0; + } + + norm = 1. 
/ (width / 4) / (height / 4); + ssim_total *= norm; + ssim2_total *= norm; + m->ssim2 = ssim2_total; + m->ssim = ssim_total; + if (old_ssim_total == 0) inconsistency_total = 0; + + m->ssimc = inconsistency_total; + + m->dssim = dssim_total; + return inconsistency_total; +} + +double highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + uint32_t bd, uint32_t in_bd) { + double a, b, c; + double ssimv; + uint32_t shift = 0; + + assert(bd >= in_bd); + shift = bd - in_bd; + + a = highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, + dest->y_stride, source->y_crop_width, source->y_crop_height, + in_bd, shift); + + b = highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); + + c = highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); + + ssimv = a * .8 + .1 * (b + c); + + *weight = 1; + + return ssimv; +} + int main(int argc, char *argv[]) { FILE *framestats = NULL; + int bit_depth = 8; int w = 0, h = 0, tl_skip = 0, tl_skips_remaining = 0; double ssimavg = 0, ssimyavg = 0, ssimuavg = 0, ssimvavg = 0; double psnrglb = 0, psnryglb = 0, psnruglb = 0, psnrvglb = 0; @@ -200,11 +581,12 @@ int main(int argc, char *argv[]) { size_t i, n_frames = 0, allocated_frames = 0; int return_value = 0; input_file_t in[2]; + double peak = 255.0; if (argc < 2) { fprintf(stderr, "Usage: %s file1.{yuv|y4m} file2.{yuv|y4m}" - "[WxH tl_skip={0,1,3}]\n", + "[WxH tl_skip={0,1,3} frame_stats_file bits]\n", argv[0]); return_value = 1; goto clean_up; @@ -214,7 +596,11 @@ int main(int argc, char *argv[]) { sscanf(argv[3], "%dx%d", &w, &h); } - if (open_input_file(argv[1], &in[0], w, h) < 0) { + if (argc > 6) { + sscanf(argv[6], "%d", &bit_depth); + } + + if (open_input_file(argv[1], &in[0], w, h, bit_depth) < 0) { fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]); goto clean_up; } @@ -223,9 +609,13 @@ int main(int argc, char *argv[]) { // If a y4m is the first file and w, h is not set grab from first file. w = in[0].w; h = in[0].h; + bit_depth = in[0].bit_depth; } + if (bit_depth == 10) peak = 1023.0; - if (open_input_file(argv[2], &in[1], w, h) < 0) { + if (bit_depth == 12) peak = 4095; + + if (open_input_file(argv[2], &in[1], w, h, bit_depth) < 0) { fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]); goto clean_up; } @@ -264,7 +654,7 @@ int main(int argc, char *argv[]) { size_t r1, r2; unsigned char *y[2], *u[2], *v[2]; - r1 = read_input_file(&in[0], &y[0], &u[0], &v[0]); + r1 = read_input_file(&in[0], &y[0], &u[0], &v[0], bit_depth); if (r1) { // Reading parts of file1.yuv that were not used in temporal layer. 
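The peak values selected above follow peak = 2^bit_depth - 1 (255.0, 1023.0, 4095), and the hunks below thread that peak through every PSNR conversion. Assuming mse2psnr keeps the convention of the vp9_mse2psnr it replaces, with the third argument being an accumulated sum of squared errors rather than a per-pixel MSE, the conversion amounts to PSNR = min(MAX_PSNR, 10 * log10(samples * peak^2 / sse)).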
@@ -276,7 +666,7 @@ int main(int argc, char *argv[]) { tl_skips_remaining = tl_skip; } - r2 = read_input_file(&in[1], &y[1], &u[1], &v[1]); + r2 = read_input_file(&in[1], &y[1], &u[1], &v[1], bit_depth); if (r1 && r2 && r1 != r2) { fprintf(stderr, "Failed to read data: %s [%d/%d]\n", strerror(errno), @@ -286,9 +676,22 @@ int main(int argc, char *argv[]) { } else if (r1 == 0 || r2 == 0) { break; } +#if CONFIG_VP9_HIGHBITDEPTH +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + if (bit_depth < 9) { \ + ssim = ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); \ + } else { \ + ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), w, \ + w, w, h, bit_depth, bit_depth - 8); \ + psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w, \ + CAST_TO_SHORTPTR(buf1), w, w, h); \ + } +#else #define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ - ssim = vp8_ssim2(buf0, buf1, w, w, w, h); \ + ssim = ssim2(buf0, buf1, w, w, w, h); \ psnr = calc_plane_error(buf0, w, buf1, w, w, h); +#endif if (n_frames == allocated_frames) { allocated_frames = allocated_frames == 0 ? 1024 : allocated_frames * 2; @@ -321,11 +724,11 @@ int main(int argc, char *argv[]) { ssimuavg += ssimu[i]; ssimvavg += ssimv[i]; - frame_psnr = vp9_mse2psnr(w * h * 6 / 4, 255.0, - (double)psnry[i] + psnru[i] + psnrv[i]); - frame_psnry = vp9_mse2psnr(w * h * 4 / 4, 255.0, (double)psnry[i]); - frame_psnru = vp9_mse2psnr(w * h * 1 / 4, 255.0, (double)psnru[i]); - frame_psnrv = vp9_mse2psnr(w * h * 1 / 4, 255.0, (double)psnrv[i]); + frame_psnr = + mse2psnr(w * h * 6 / 4, peak, (double)psnry[i] + psnru[i] + psnrv[i]); + frame_psnry = mse2psnr(w * h * 4 / 4, peak, (double)psnry[i]); + frame_psnru = mse2psnr(w * h * 1 / 4, peak, (double)psnru[i]); + frame_psnrv = mse2psnr(w * h * 1 / 4, peak, (double)psnrv[i]); psnravg += frame_psnr; psnryavg += frame_psnry; @@ -367,10 +770,10 @@ int main(int argc, char *argv[]) { puts(""); psnrglb = psnryglb + psnruglb + psnrvglb; - psnrglb = vp9_mse2psnr((double)n_frames * w * h * 6 / 4, 255.0, psnrglb); - psnryglb = vp9_mse2psnr((double)n_frames * w * h * 4 / 4, 255.0, psnryglb); - psnruglb = vp9_mse2psnr((double)n_frames * w * h * 1 / 4, 255.0, psnruglb); - psnrvglb = vp9_mse2psnr((double)n_frames * w * h * 1 / 4, 255.0, psnrvglb); + psnrglb = mse2psnr((double)n_frames * w * h * 6 / 4, peak, psnrglb); + psnryglb = mse2psnr((double)n_frames * w * h * 4 / 4, peak, psnryglb); + psnruglb = mse2psnr((double)n_frames * w * h * 1 / 4, peak, psnruglb); + psnrvglb = mse2psnr((double)n_frames * w * h * 1 / 4, peak, psnrvglb); printf("GlbPSNR: %lf\n", psnrglb); printf("GlbPSNR-Y: %lf\n", psnryglb); diff --git a/libvpx/vp8/common/blockd.h b/libvpx/vp8/common/blockd.h index 74fc5d6db..1a3aad16a 100644 --- a/libvpx/vp8/common/blockd.h +++ b/libvpx/vp8/common/blockd.h @@ -169,6 +169,11 @@ typedef struct { typedef struct { FRAME_TYPE frame_type; int is_frame_dropped; + // If frame is dropped due to overshoot after encode_frame. This triggers a + // drop and resets rate control with Q forced to max for following frame. + // The check for this dropping due to overshoot is only done on lowest stream, + // and if set will force drop on all spatial streams for that current frame. + int is_frame_dropped_overshoot_maxqp; // The frame rate for the lowest resolution. 
double low_res_framerate; /* The frame number of each reference frames */ diff --git a/libvpx/vp8/common/loopfilter_filters.c b/libvpx/vp8/common/loopfilter_filters.c index 2a7cde878..188e290ca 100644 --- a/libvpx/vp8/common/loopfilter_filters.c +++ b/libvpx/vp8/common/loopfilter_filters.c @@ -86,10 +86,12 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0, u = vp8_signed_char_clamp(ps1 + filter_value); *op1 = u ^ 0x80; } -void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { + +static void loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { int hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -109,10 +111,11 @@ void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ } while (++i < count * 8); } -void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { +static void loop_filter_vertical_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { int hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -185,11 +188,11 @@ static void vp8_mbfilter(signed char mask, uc hev, uc *op2, uc *op1, uc *op0, *op2 = s ^ 0x80; } -void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - int count) { +static void mbloop_filter_horizontal_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { signed char hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -210,10 +213,11 @@ void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p, } while (++i < count * 8); } -void vp8_mbloop_filter_vertical_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { +static void mbloop_filter_vertical_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { signed char hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -295,17 +299,17 @@ void vp8_loop_filter_simple_vertical_edge_c(unsigned char *s, int p, void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } @@ -313,17 +317,17 @@ void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, 
int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } @@ -331,21 +335,21 @@ void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); if (u_ptr) { - vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); + loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 1); } if (v_ptr) { - vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); + loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 1); } } @@ -363,21 +367,21 @@ void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 1); + loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 1); + loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); } } diff --git a/libvpx/vp8/common/mfqe.c b/libvpx/vp8/common/mfqe.c index 5aace8c99..b6f8146b8 100644 --- a/libvpx/vp8/common/mfqe.c +++ b/libvpx/vp8/common/mfqe.c @@ -74,8 +74,7 @@ static void apply_ifactor(unsigned char *y_src, int y_src_stride, src_weight); vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight); - } 
else /* if (block_size == 8) */ - { + } else { vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, src_weight); vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, @@ -136,8 +135,7 @@ static void multiframe_quality_enhance_block( usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6; vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride) + 32) >> 6; #endif - } else /* if (blksize == 8) */ - { + } else { actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse) + 32) >> 6; act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse) + 32) >> 6; #ifdef USE_SSD @@ -186,14 +184,12 @@ static void multiframe_quality_enhance_block( apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd, uvd_stride, blksize, ifactor); } - } else /* else implicitly copy from previous frame */ - { + } else { /* else implicitly copy from previous frame */ if (blksize == 16) { vp8_copy_mem16x16(y, y_stride, yd, yd_stride); vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride); vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride); - } else /* if (blksize == 8) */ - { + } else { vp8_copy_mem8x8(y, y_stride, yd, yd_stride); for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride) { @@ -297,8 +293,7 @@ void vp8_multiframe_quality_enhance(VP8_COMMON *cm) { } } } - } else /* totmap = 4 */ - { + } else { /* totmap = 4 */ multiframe_quality_enhance_block( 16, qcurr, qprev, y_ptr, u_ptr, v_ptr, show->y_stride, show->uv_stride, yd_ptr, ud_ptr, vd_ptr, dest->y_stride, diff --git a/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/libvpx/vp8/common/mips/dspr2/filter_dspr2.c index 2de343419..e46827b0e 100644 --- a/libvpx/vp8/common/mips/dspr2/filter_dspr2.c +++ b/libvpx/vp8/common/mips/dspr2/filter_dspr2.c @@ -673,9 +673,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [tp2] "=&r"(tp2), [n2] "=&r"(n2), [p4] "=&r"(p4), [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4) + [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p1] "+r"(p1) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1), - [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), [p2] "r"(p2), + [n1] "r"(n1), [vector4a] "r"(vector4a), [p2] "r"(p2), [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3), [src_ptr] "r"(src_ptr)); @@ -724,9 +724,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [n1] "=&r"(n1), [p3] "=&r"(p3), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4) + [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p4] "+r"(p4) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp2] "r"(tp2), - [p2] "r"(p2), [n2] "r"(n2), [p4] "r"(p4), [n4] "r"(n4), [p1] "r"(p1), + [p2] "r"(p2), [n2] "r"(n2), [n4] "r"(n4), [p1] "r"(p1), [src_ptr] "r"(src_ptr), [vector4a] "r"(vector4a), [vector3b] "r"(vector3b)); @@ -781,9 +781,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2), [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [Temp4] "=r"(Temp4) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1), - [p4] "r"(p4), [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), + [Temp4] "=r"(Temp4), [tp1] "+r"(tp1) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [p4] "r"(p4), + [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] 
"r"(n3), [src_ptr] "r"(src_ptr), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); diff --git a/libvpx/vp8/common/mips/mmi/copymem_mmi.c b/libvpx/vp8/common/mips/mmi/copymem_mmi.c new file mode 100644 index 000000000..86a32aa9e --- /dev/null +++ b/libvpx/vp8/common/mips/mmi/copymem_mmi.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/asmdefs_mmi.h" + +#define COPY_MEM_16X2 \ + "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \ + "ldl %[tmp0], 0x0f(%[src]) \n\t" \ + "ldr %[tmp0], 0x08(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \ + "sdl %[tmp0], 0x0f(%[dst]) \n\t" \ + "sdr %[tmp0], 0x08(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "ldl %[tmp1], 0x0f(%[src]) \n\t" \ + "ldr %[tmp1], 0x08(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "gssdlc1 %[ftmp1], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp1], 0x00(%[dst]) \n\t" \ + "sdl %[tmp1], 0x0f(%[dst]) \n\t" \ + "sdr %[tmp1], 0x08(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + +#define COPY_MEM_8X2 \ + "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "ldl %[tmp0], 0x07(%[src]) \n\t" \ + "ldr %[tmp0], 0x00(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + \ + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) \ + "sdl %[tmp0], 0x07(%[dst]) \n\t" \ + "sdr %[tmp0], 0x00(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + +void vp8_copy_mem16x16_mmi(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + double ftmp[2]; + uint64_t tmp[2]; + uint8_t loop_count = 4; + + /* clang-format off */ + __asm__ volatile ( + "1: \n\t" + COPY_MEM_16X2 + COPY_MEM_16X2 + MMI_ADDIU(%[loop_count], %[loop_count], -0x01) + "bnez %[loop_count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [loop_count]"+&r"(loop_count), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +void vp8_copy_mem8x8_mmi(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + double ftmp[2]; + uint64_t tmp[1]; + uint8_t loop_count = 4; + + /* clang-format off */ + __asm__ volatile ( + "1: \n\t" + COPY_MEM_8X2 + MMI_ADDIU(%[loop_count], %[loop_count], -0x01) + "bnez %[loop_count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), [loop_count]"+&r"(loop_count), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +void vp8_copy_mem8x4_mmi(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + double ftmp[2]; + uint64_t tmp[1]; + + /* clang-format off */ + __asm__ volatile ( + COPY_MEM_8X2 + 
COPY_MEM_8X2 + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} diff --git a/libvpx/vp8/common/mips/mmi/dequantize_mmi.c b/libvpx/vp8/common/mips/mmi/dequantize_mmi.c new file mode 100644 index 000000000..b3f8084ae --- /dev/null +++ b/libvpx/vp8/common/mips/mmi/dequantize_mmi.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +void vp8_dequantize_b_mmi(BLOCKD *d, int16_t *DQC) { + double ftmp[8]; + + __asm__ volatile( + "gsldlc1 %[ftmp0], 0x07(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp3], 0x1f(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp3], 0x18(%[qcoeff]) \n\t" + + "gsldlc1 %[ftmp4], 0x07(%[DQC]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[DQC]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[DQC]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[DQC]) \n\t" + "gsldlc1 %[ftmp6], 0x17(%[DQC]) \n\t" + "gsldrc1 %[ftmp6], 0x10(%[DQC]) \n\t" + "gsldlc1 %[ftmp7], 0x1f(%[DQC]) \n\t" + "gsldrc1 %[ftmp7], 0x18(%[DQC]) \n\t" + + "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp3], 0x1f(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp3], 0x18(%[dqcoeff]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]) + : [dqcoeff] "r"(d->dqcoeff), [qcoeff] "r"(d->qcoeff), [DQC] "r"(DQC) + : "memory"); +} + +void vp8_dequant_idct_add_mmi(int16_t *input, int16_t *dq, unsigned char *dest, + int stride) { + double ftmp[8]; + + __asm__ volatile( + "gsldlc1 %[ftmp0], 0x07(%[dq]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[dq]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[dq]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[dq]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[dq]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[dq]) \n\t" + "gsldlc1 %[ftmp3], 0x1f(%[dq]) \n\t" + "gsldrc1 %[ftmp3], 0x18(%[dq]) \n\t" + + "gsldlc1 %[ftmp4], 0x07(%[input]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[input]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[input]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[input]) \n\t" + "gsldlc1 %[ftmp6], 0x17(%[input]) \n\t" + "gsldrc1 %[ftmp6], 0x10(%[input]) \n\t" + "gsldlc1 %[ftmp7], 0x1f(%[input]) \n\t" + "gsldrc1 %[ftmp7], 0x18(%[input]) \n\t" + + "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "pmullh %[ftmp3], 
%[ftmp3], %[ftmp7] \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[input]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[input]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[input]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[input]) \n\t" + "gssdlc1 %[ftmp3], 0x1f(%[input]) \n\t" + "gssdrc1 %[ftmp3], 0x18(%[input]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]) + : [dq] "r"(dq), [input] "r"(input) + : "memory"); + + vp8_short_idct4x4llm_mmi(input, dest, stride, dest, stride); + + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t" + "sdl $0, 0x0f(%[input]) \n\t" + "sdr $0, 0x08(%[input]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[input]) \n\t" + "sdl $0, 0x1f(%[input]) \n\t" + "sdr $0, 0x18(%[input]) \n\t" + : [ftmp0] "=&f"(ftmp[0]) + : [input] "r"(input) + : "memory"); +} diff --git a/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c b/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c new file mode 100644 index 000000000..f6020ab46 --- /dev/null +++ b/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, + int stride, int8_t *eobs) { + int i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dst, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst += 4; + } + + dst += 4 * stride - 16; + } +} + +void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu, + uint8_t *dstv, int stride, + int8_t *eobs) { + int i, j; + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dstu, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstu, stride, dstu, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dstu += 4; + } + + dstu += 4 * stride - 8; + } + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dstv, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstv, stride, dstv, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dstv += 4; + } + + dstv += 4 * stride - 8; + } +} diff --git a/libvpx/vp8/common/mips/mmi/idctllm_mmi.c b/libvpx/vp8/common/mips/mmi/idctllm_mmi.c new file mode 100644 index 000000000..5e48f5916 --- /dev/null +++ b/libvpx/vp8/common/mips/mmi/idctllm_mmi.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +#define TRANSPOSE_4H \ + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ + MMI_LI(%[tmp0], 0x93) \ + "mtc1 %[tmp0], %[ftmp10] \n\t" \ + "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ + "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ + "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + +void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + double ftmp[12]; + uint32_t tmp[0]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_04) = { 0x0004000400040004ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = { 0x4e7b4e7b4e7b4e7bULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = { 0x22a322a322a322a3ULL }; + + __asm__ volatile ( + MMI_LI(%[tmp0], 0x02) + "mtc1 %[tmp0], %[ftmp11] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + + // ip[0...3] + ip[8...11] + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // ip[0...3] - ip[8...11] + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // (ip[12...15] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t" + // (ip[ 4... 7] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t" + // ip[ 4... 7] + ((ip[ 4... 
7] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t" + // ip[12...15] + ((ip[12...15] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t" + "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // b + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // c + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t" + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + // d + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t" + "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" + + MMI_LI(%[tmp0], 0x03) + "mtc1 %[tmp0], %[ftmp11] \n\t" + // a + d + "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_04] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + // b + c + "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_04] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + // b - c + "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_04] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + // a - d + "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_04] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp5] \n\t" +#else + "gslwlc1 %[ftmp5], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp5], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" +#else + "gslwlc1 %[ftmp6], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp2], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp2], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" +#else + "gslwlc1 %[ftmp7], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "gsswlc1 %[ftmp3], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp3], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 
0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" +#else + "gslwlc1 %[ftmp8], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp8], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "gsswlc1 %[ftmp4], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp4], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), + [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr) + : [ip]"r"(input), [ff_ph_22a3]"f"(ff_ph_22a3), + [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_04]"f"(ff_ph_04), + [pred_stride]"r"((mips_reg)pred_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + +void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int a1 = ((input_dc + 4) >> 3); + double ftmp[5]; + int low32; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pshufh %[a1], %[a1], %[ftmp0] \n\t" + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32), + [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr) + : [dst_stride]"r"((mips_reg)dst_stride), + [pred_stride]"r"((mips_reg)pred_stride), [a1]"f"(a1) + : "memory" + ); +} + +void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) { + int i; + int16_t output[16]; + double ftmp[12]; + uint32_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_03) = { 0x0003000300030003ULL }; + + __asm__ volatile ( + MMI_LI(%[tmp0], 0x03) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" 
+ "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + "paddh %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "psubh %[ftmp8], %[ftmp3], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp2], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // d + "psubh %[ftmp6], %[ftmp1], %[ftmp4] \n\t" + // b + "paddh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c + "psubh %[ftmp8], %[ftmp2], %[ftmp3] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp3], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ff_ph_03] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_03] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_03] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_03] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H + "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" + "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" + "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]) + : [ip]"r"(input), [op]"r"(output), [ff_ph_03]"f"(ff_ph_03) + : "memory" + ); + + for (i = 0; i < 16; i++) { + mb_dqcoeff[i * 16] = output[i]; + } +} diff --git a/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c new file mode 100644 index 000000000..f2182f95c --- /dev/null +++ b/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c @@ -0,0 +1,1337 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vp8/common/loopfilter.h" +#include "vp8/common/onyxc_int.h" +#include "vpx_ports/asmdefs_mmi.h" + +DECLARE_ALIGNED(8, static const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_003f) = { 0x003f003f003f003fULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_0900) = { 0x0900090009000900ULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_1200) = { 0x1200120012001200ULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_1b00) = { 0x1b001b001b001b00ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_fe) = { 0xfefefefefefefefeULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_80) = { 0x8080808080808080ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_04) = { 0x0404040404040404ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_03) = { 0x0303030303030303ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_01) = { 0x0101010101010101ULL }; + +void vp8_loop_filter_horizontal_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + uint32_t tmp[1]; + mips_reg addr[2]; + double ftmp[12]; + __asm__ volatile ( + "1: \n\t" + "gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp9], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "gsldlc1 %[ftmp10], 
0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[blimit]) \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + + "gsldlc1 %[ftmp10], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[thresh]) \n\t" + "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "psubusb %[ftmp2], %[ftmp11], %[ftmp10] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + + "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psubsb %[ftmp3], %[ftmp6], %[ftmp5] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + + "paddsb %[ftmp8], %[ftmp2], %[ff_pb_03] \n\t" + "paddsb %[ftmp9], %[ftmp2], %[ff_pb_04] \n\t" + + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" + "packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + "xor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + "punpckhbh %[ftmp9], %[ftmp11], %[ftmp9] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "paddsh %[ftmp11], %[ftmp0], %[ff_ph_01] \n\t" + "packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t" + + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "pandn %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t" + + "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + + "addiu %[count], %[count], -0x01 \n\t" + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + 
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), + [ff_ph_01]"f"(ff_ph_01), [ff_pb_fe]"f"(ff_pb_fe), + [ff_pb_80]"f"(ff_pb_80), [ff_pb_04]"f"(ff_pb_04), + [ff_pb_03]"f"(ff_pb_03) + : "memory" + ); +} + +void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count) { + uint32_t tmp[1]; + mips_reg addr[2]; + double ftmp[13]; + + __asm__ volatile ( + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SLL (%[tmp0], %[src_pixel_step], 0x01) + MMI_ADDU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + + "gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t" + "punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t" + "punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t" + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t" + + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* ftmp1:p3 ftmp2:p2 */ + "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], 
%[ftmp8] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t" + + /* abs (p0-q0) */ + "pasubub %[ftmp11], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + /* abs (p1-q1) */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" + "and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t" + "paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp0:mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t" + + /* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */ + "psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "or %[ftmp2], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1:hev */ + "xor %[ftmp1], %[ftmp2], %[ftmp1] \n\t" + + "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + "psubsb %[ftmp2], %[ftmp5], %[ftmp10] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psubsb %[ftmp3], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2:filter_value */ + "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + + "paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t" + "paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp12], %[ftmp0], %[ftmp8] \n\t" + + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp11] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp11], %[ftmp0], %[ftmp8] \n\t" + + "psubsb %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t" + + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "psrah 
%[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t" + "pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t" + "psubsb %[ftmp10], %[ftmp10], %[ftmp2] \n\t" + "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + /* ftmp5: *op1 ; ftmp6: *op0 */ + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + /* ftmp9: *oq0 ; ftmp10: *oq1 */ + "punpcklbh %[ftmp4], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp3], %[ftmp9], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp2], %[ftmp4] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "dsrl %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + + "dsrl %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + "gsswlc1 %[ftmp1], 0x05(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[src_ptr]) \n\t" + + "dsrl %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "gsswlc1 %[ftmp1], 0x05(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[addr0]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + "dsrl %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + MMI_ADDIU(%[count], %[count], -0x01) + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [ff_ph_01]"f"(ff_ph_01), [ff_pb_03]"f"(ff_pb_03), + [ff_pb_04]"f"(ff_pb_04), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_fe]"f"(ff_pb_fe) + : "memory" + ); +} + +/* clang-format off */ +#define VP8_MBLOOP_HPSRAB \ + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \ + "psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \ + "psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \ + "packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t" + +#define VP8_MBLOOP_HPSRAB_ADD(reg) \ + "punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \ + "punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \ + "pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \ + "pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \ + "paddh %[ftmp1], 
%[ftmp1], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \ + "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \ + "packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" +/* clang-format on */ + +void vp8_mbloop_filter_horizontal_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + uint32_t tmp[1]; + double ftmp[13]; + + __asm__ volatile ( + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "1: \n\t" + "gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t" + /* ftmp1: p3 */ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + /* ftmp3: p2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + /* ftmp4: p1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + /* ftmp5: p0 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + /* ftmp6: q0 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + /* ftmp7: q1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + /* ftmp8: q2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + /* ftmp2: q3 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t" + + "gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t" + + "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp2] 
\n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + /* ftmp1: hev */ + "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t" + VP8_MBLOOP_HPSRAB + "paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t" + VP8_MBLOOP_HPSRAB + "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00]) + "psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200]) + "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900]) + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "addiu %[count], %[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_04]"f"(ff_pb_04), 
[ff_pb_03]"f"(ff_pb_03), + [ff_ph_0900]"f"(ff_ph_0900), [ff_ph_1b00]"f"(ff_ph_1b00), + [ff_ph_1200]"f"(ff_ph_1200), [ff_ph_003f]"f"(ff_ph_003f) + : "memory" + ); +} + +#define VP8_MBLOOP_VPSRAB_ADDH \ + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \ + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + +#define VP8_MBLOOP_VPSRAB_ADDT \ + "paddh %[ftmp7], %[ftmp7], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \ + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \ + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" + +void vp8_mbloop_filter_vertical_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + mips_reg tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, srct[1]); + double ftmp[14]; + + __asm__ volatile ( + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t" + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* srct[0x00]: q3 */ + "sdc1 %[ftmp12], 0x00(%[srct]) \n\t" + /* ftmp1:p3 ftmp2:p2 */ + "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* srct[0x08]: p3 */ + "sdc1 %[ftmp1], 0x08(%[srct]) \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] 
\n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t" + /* abs (p0-q0) * 2 */ + "pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* abs (p1-q1) / 2 */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" + "and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t" + "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + + /* abs(p1-p0) - thresh */ + "psubusb %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + /* abs(q1-q0) - thresh */ + "psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "or %[ftmp3], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1: hev */ + "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + + /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */ + "xor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t" + "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "xor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + + "psubsb %[ftmp3], %[ftmp5], %[ftmp10] \n\t" + "psubsb %[ftmp4], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + /* filter_value &= mask */ + "and %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + /* Filter2 = filter_value & hev */ + "and %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + /* filter_value &= ~hev */ + "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t" + + "paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t" + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp12] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t" + /* ftmp9: qs0 */ + "psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] 
\n\t" + /* ftmp6: ps0 */ + "paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp12] \n\t" + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t" + /* ftmp9: oq0 */ + "xor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t" + /* ftmp6: op0 */ + "xor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t" + /* ftmp10: oq1 */ + "xor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t" + /* ftmp5: op1 */ + "xor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "pmulhh %[ftmp7], %[ftmp7], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t" + /* ftmp11: oq2 */ + "xor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2: op2 */ + "xor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp12], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp8], 0x08(%[srct]) \n\t" + + "punpcklbh %[ftmp0], %[ftmp8], %[ftmp2] \n\t" + "punpckhbh %[ftmp1], %[ftmp8], %[ftmp2] \n\t" + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp5], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + "punpckhhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" + + "punpcklbh %[ftmp0], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "punpcklbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpcklhw %[ftmp8], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp10], %[ftmp1], %[ftmp3] \n\t" + "punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t" + + "punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t" + "punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t" + "punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t" + "punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 
0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "addiu %[count], %[count], -0x01 \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr), + [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [srct]"r"(srct), [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [ff_ph_003f]"f"(ff_ph_003f), [ff_ph_0900]"f"(ff_ph_0900), + [ff_pb_03]"f"(ff_pb_03), [ff_pb_04]"f"(ff_pb_04), + [ff_pb_80]"f"(ff_pb_80), [ff_pb_fe]"f"(ff_pb_fe) + : "memory" + ); +} + +#define VP8_SIMPLE_HPSRAB \ + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \ + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \ + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" \ + "psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \ + "psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \ + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + +void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint32_t tmp[1], count = 2; + mips_reg addr[2]; + double ftmp[12]; + + __asm__ volatile ( + "li %[tmp0], 0x08 \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "li %[tmp0], 0x03 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[blimit]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + "pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t" + "and %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + "pasubub %[ftmp5], %[ftmp6], %[ftmp0] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + + "xor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp3], %[ftmp0], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + VP8_SIMPLE_HPSRAB + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + "psubsb %[ftmp5], 
%[ftmp5], %[ff_pb_01] \n\t" + VP8_SIMPLE_HPSRAB + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "addiu %[count], %[count], -0x01 \n\t" + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [blimit]"r"(blimit), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01) + : "memory" + ); +} + +void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint32_t tmp[1], count = 2; + mips_reg addr[2]; + DECLARE_ALIGNED(8, const uint64_t, srct[1]); + double ftmp[12]; + + __asm__ volatile ( + "li %[tmp0], 0x08 \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x02) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "gslwlc1 %[ftmp4], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpckhhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp7], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp7] \n\t" + + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp1], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t" + "punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t" + "and %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t" + "psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "pasubub %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], 
%[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp7], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[blimit]) \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + + "sdc1 %[ftmp0], 0x00(%[srct]) \n\t" + "sdc1 %[ftmp3], 0x08(%[srct]) \n\t" + + "xor %[ftmp0], %[ftmp0], %[ff_pb_80] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + + "xor %[ftmp6], %[ftmp1], %[ff_pb_80] \n\t" + "xor %[ftmp3], %[ftmp2], %[ff_pb_80] \n\t" + "psubsb %[ftmp7], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + + "li %[tmp0], 0x03 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" + + "li %[tmp0], 0x03 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp0], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp4], 0x08(%[srct]) \n\t" + + "punpckhbh %[ftmp1], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp2], %[ftmp3], %[ftmp4] \n\t" + "punpckhbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + + "punpckhhw %[ftmp6], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + "dsrl %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "dsrl %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + "dsrl %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + + "dsrl %[ftmp5], %[ftmp5], %[ftmp10] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x8]) + "addiu %[count], 
%[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [blimit]"r"(blimit), [srct]"r"(srct), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), + [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)), + [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01) + : "memory" + ); +} + +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_horizontal_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(u_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(v_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_vertical_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_mmi(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_mmi(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_mmi(u_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_mmi(v_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_vertical_edge_mmi(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_mmi(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_mmi(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); +} + +void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, + blimit); + 
vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + blimit); +} + +void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 12, y_stride, blimit); +} diff --git a/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c b/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c new file mode 100644 index 000000000..77d665d45 --- /dev/null +++ b/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/filter.h" +#include "vpx_ports/asmdefs_mmi.h" + +DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = { + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 
0x005d, 0x005d, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 } +}; + +/* Horizontal filter: pixel_step is 1, output_height and output_width are + the size of horizontal filtering output, output_height is always H + 5 */ +static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp8_filter) { + uint32_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "xor %[fzero], %[fzero], %[fzero] \n\t" + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "li %[tmp0], 0x08 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp10], 0x06(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp10], -0x01(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" + + "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpckhbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + 
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[fzero] \n\t" + "gssdlc1 %[ftmp8], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [src_ptr]"+&r"(src_ptr) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width), + [ff_ph_40]"f"(ff_ph_40) + : "memory" + ); +} + +/* Horizontal filter: pixel_step is always W */ +static INLINE void vp8_filter_block1dc_v6_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) { + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + uint32_t tmp[1]; + mips_reg addr[1]; +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); + register double ftmp12 asm("$f26"); + register double ftmp13 asm("$f28"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); + register double ftmp12 asm("$f13"); + register double ftmp13 asm("$f14"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "xor %[fzero], %[fzero], %[fzero] \n\t" + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp13] \n\t" + + /* In order to make full use of memory load delay slot, + * Operation of memory loading and calculating has been rearranged. 
+ */ + "1: \n\t" + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line]) + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp8], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr0]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp9], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp10], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t" + + "pmullh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + + "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp7] \n\t" + + "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + + "pmullh %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp9] \n\t" + + "pmullh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp10] \n\t" + + "pmullh %[ftmp11], %[ftmp11], %[ftmp5] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp11] \n\t" + + "paddsh %[ftmp12], %[ftmp12], %[ff_ph_40] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "packushb %[ftmp12], %[ftmp12], %[fzero] \n\t" + "gsswlc1 %[ftmp12], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp12], 0x00(%[output_ptr]) \n\t" + + MMI_ADDIU(%[output_height], %[output_height], -0x01) + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12), + [ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)), + [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)), + [vp8_filter]"r"(vp8_filter), + [output_pitch]"r"((mips_reg)output_pitch), + [ff_ph_40]"f"(ff_ph_40) + : "memory" + ); +} + +/* When xoffset == 0, vp8_filter = {0, 0, 128, 0, 0, 0}; the functions + vp8_filter_block1d_h6_mmi and vp8_filter_block1dc_v6_mmi can + be simplified. */ +static INLINE void vp8_filter_block1d_h6_filter0_mmi( + unsigned char *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int output_height, + unsigned int output_width) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "xor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + + "punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + 
"addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [output_width]"r"(output_width) + : "memory" + ); +} + +static INLINE void vp8_filter_block1dc_v6_filter0_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "xor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDIU(%[output_height], %[output_height], -0x01) + "packushb %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [output_pitch]"r"((mips_reg)output_pitch) + : "memory" + ); +} + +#define sixtapNxM(n, m) \ + void vp8_sixtap_predict##n##x##m##_mmi( \ + unsigned char *src_ptr, int src_pixels_per_line, int xoffset, \ + int yoffset, unsigned char *dst_ptr, int dst_pitch) { \ + DECLARE_ALIGNED(16, uint16_t, \ + FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 
16 : n))]); \ + const int16_t *HFilter, *VFilter; \ + int i, loop = n / 4; \ + HFilter = vp8_six_tap_mmi[xoffset]; \ + VFilter = vp8_six_tap_mmi[yoffset]; \ + \ + if (xoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_filter0_mmi( \ + src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4, \ + src_pixels_per_line, m + 5, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \ + FData2 + i * 4, src_pixels_per_line, m + 5, \ + n * 2, HFilter); \ + } \ + } \ + if (yoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_filter0_mmi( \ + FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m, \ + dst_pitch, n * 2, VFilter); \ + } \ + } \ + } + +sixtapNxM(4, 4); +sixtapNxM(8, 8); +sixtapNxM(8, 4); +sixtapNxM(16, 16); diff --git a/libvpx/vp8/common/onyxd.h b/libvpx/vp8/common/onyxd.h index cc2cb8089..d3c1b0e97 100644 --- a/libvpx/vp8/common/onyxd.h +++ b/libvpx/vp8/common/onyxd.h @@ -22,6 +22,7 @@ extern "C" { #include "vpx/vp8.h" struct VP8D_COMP; +struct VP8Common; typedef struct { int Width; @@ -45,6 +46,7 @@ int vp8dx_receive_compressed_data(struct VP8D_COMP *comp, size_t size, int vp8dx_get_raw_frame(struct VP8D_COMP *comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags); +int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame); vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *comp, enum vpx_ref_frame_type ref_frame_flag, diff --git a/libvpx/vp8/common/reconintra.c b/libvpx/vp8/common/reconintra.c index 986074ec7..8e2094da8 100644 --- a/libvpx/vp8/common/reconintra.c +++ b/libvpx/vp8/common/reconintra.c @@ -71,8 +71,16 @@ void vp8_build_intra_predictors_mbuv_s( unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char *upred_ptr, unsigned char *vpred_ptr, int pred_stride) { MB_PREDICTION_MODE uvmode = x->mode_info_context->mbmi.uv_mode; +#if HAVE_VSX + /* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from + uleft_col and vleft_col. Play it safe by reserving enough stack + space here. */ + unsigned char uleft_col[16]; + unsigned char vleft_col[16]; +#else unsigned char uleft_col[8]; unsigned char vleft_col[8]; +#endif int i; intra_pred_fn fn; diff --git a/libvpx/vp8/common/reconintra4x4.c b/libvpx/vp8/common/reconintra4x4.c index 7852cf9da..64d33a287 100644 --- a/libvpx/vp8/common/reconintra4x4.c +++ b/libvpx/vp8/common/reconintra4x4.c @@ -40,7 +40,15 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left) { - unsigned char Aboveb[12], *Above = Aboveb + 4; +/* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from + Above (aka, Aboveb + 4). Play it safe by reserving enough stack + space here. Similarly for "Left". */ +#if HAVE_VSX + unsigned char Aboveb[20]; +#else + unsigned char Aboveb[12]; +#endif + unsigned char *Above = Aboveb + 4; #if HAVE_NEON // Neon intrinsics are unable to load 32 bits, or 4 8 bit values. Instead, it // over reads but does not use the extra 4 values. @@ -50,6 +58,8 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, // indeed read, they are not used.
vp8_zero_array(Left, 8); #endif // VPX_WITH_ASAN +#elif HAVE_VSX + unsigned char Left[16]; #else unsigned char Left[4]; #endif // HAVE_NEON diff --git a/libvpx/vp8/common/rtcd_defs.pl b/libvpx/vp8/common/rtcd_defs.pl index bc5e05799..3df745f75 100644 --- a/libvpx/vp8/common/rtcd_defs.pl +++ b/libvpx/vp8/common/rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vp8_common_forward_decls() { print <<EOF /* @@ -22,67 +32,71 @@ forward_decls qw/vp8_common_forward_decls/; # Dequant # add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc"; -specialize qw/vp8_dequantize_b mmx neon msa/; +specialize qw/vp8_dequantize_b mmx neon msa mmi/; add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride"; -specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa/; +specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa mmi/; add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs"; -specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa/; +specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi/; add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"; -specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa/; +specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi/; # # Loopfilter # add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa/; +specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi/; add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa/; +specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi/; add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa/; +specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi/; add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa/; +specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi/; add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit"; -specialize qw/vp8_loop_filter_simple_mbv sse2 neon msa/; +specialize qw/vp8_loop_filter_simple_mbv sse2 neon msa mmi/; $vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c; $vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2; $vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon; $vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa; +$vp8_loop_filter_simple_mbv_mmi=vp8_loop_filter_simple_vertical_edge_mmi; add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int 
ystride, const unsigned char *blimit"; -specialize qw/vp8_loop_filter_simple_mbh sse2 neon msa/; +specialize qw/vp8_loop_filter_simple_mbh sse2 neon msa mmi/; $vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c; $vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2; $vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon; $vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa; +$vp8_loop_filter_simple_mbh_mmi=vp8_loop_filter_simple_horizontal_edge_mmi; add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit"; -specialize qw/vp8_loop_filter_simple_bv sse2 neon msa/; +specialize qw/vp8_loop_filter_simple_bv sse2 neon msa mmi/; $vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c; $vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2; $vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon; $vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa; +$vp8_loop_filter_simple_bv_mmi=vp8_loop_filter_bvs_mmi; add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit"; -specialize qw/vp8_loop_filter_simple_bh sse2 neon msa/; +specialize qw/vp8_loop_filter_simple_bh sse2 neon msa mmi/; $vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c; $vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2; $vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon; $vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa; +$vp8_loop_filter_simple_bh_mmi=vp8_loop_filter_bhs_mmi; # # IDCT # #idct16 add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride"; -specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa/; +specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa mmi/; #iwalsh1 add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *output"; @@ -90,23 +104,23 @@ specialize qw/vp8_short_inv_walsh4x4_1 dspr2/; #iwalsh16 add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output"; -specialize qw/vp8_short_inv_walsh4x4 sse2 neon dspr2 msa/; +specialize qw/vp8_short_inv_walsh4x4 sse2 neon dspr2 msa mmi/; #idct1_scalar_add add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride"; -specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa/; +specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi/; # # RECON # add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa/; +specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa mmi/; add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa/; +specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa mmi/; add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa/; +specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi/; # # Postproc @@ -132,16 +146,16 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") { # Subpixel # add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa/; +specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi/; add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, 
int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa/; +specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi/; add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa/; +specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/; add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa/; +specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi/; add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/; @@ -172,22 +186,22 @@ if ($opts{arch} =~ /x86/) { # Forward DCT # add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_fdct4x4 sse2 neon msa/; +specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi/; add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_fdct8x4 sse2 neon msa/; +specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/; add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_walsh4x4 sse2 neon msa/; +specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/; # # Quantizer # add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *"; -specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa/; +specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi/; add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *"; -specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/; +specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa mmi/; # # Block subtraction diff --git a/libvpx/vp8/common/threading.h b/libvpx/vp8/common/threading.h index ece64f3fb..b082bf109 100644 --- a/libvpx/vp8/common/threading.h +++ b/libvpx/vp8/common/threading.h @@ -191,47 +191,18 @@ static inline int sem_destroy(sem_t *sem) { #define x86_pause_hint() #endif -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define USE_MUTEX_LOCK 1 -#endif -#endif - #include "vpx_util/vpx_thread.h" +#include "vpx_util/vpx_atomics.h" -static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) { - (void)mutex; -#if defined(USE_MUTEX_LOCK) - int ret; - pthread_mutex_lock(mutex); - ret = *p; - pthread_mutex_unlock(mutex); - return ret; -#endif - return *p; -} - -static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col, - const int *last_row_current_mb_col, - const int nsync) { - while (mb_col > (protected_read(mutex, last_row_current_mb_col) - nsync)) { +static INLINE void vp8_atomic_spin_wait( + int mb_col, const vpx_atomic_int *last_row_current_mb_col, + const int nsync) { + while (mb_col > (vpx_atomic_load_acquire(last_row_current_mb_col) - nsync)) { x86_pause_hint(); thread_sleep(0); } } -static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) { - (void)mutex; -#if defined(USE_MUTEX_LOCK) - pthread_mutex_lock(mutex); - *p = v; - pthread_mutex_unlock(mutex); - return; -#endif - *p = v; -} - -#undef USE_MUTEX_LOCK #endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ #ifdef __cplusplus diff --git a/libvpx/vp8/common/vp8_loopfilter.c b/libvpx/vp8/common/vp8_loopfilter.c index c6430be46..9fb125065 100644 
--- a/libvpx/vp8/common/vp8_loopfilter.c +++ b/libvpx/vp8/common/vp8_loopfilter.c @@ -111,11 +111,9 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd, /* Note the baseline filter values for each segment */ if (mbd->segmentation_enabled) { - /* Abs value */ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) { lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; - } else /* Delta Value */ - { + } else { /* Delta Value */ lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; } lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0; @@ -344,8 +342,7 @@ void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int frame_type) { mode_info_context++; /* Skip border mb */ } - } else /* SIMPLE_LOOPFILTER */ - { + } else { /* SIMPLE_LOOPFILTER */ for (mb_row = 0; mb_row < mb_rows; ++mb_row) { for (mb_col = 0; mb_col < mb_cols; ++mb_col) { int skip_lf = (mode_info_context->mbmi.mode != B_PRED && diff --git a/libvpx/vp8/common/vp8_skin_detection.c b/libvpx/vp8/common/vp8_skin_detection.c new file mode 100644 index 000000000..6739efa5f --- /dev/null +++ b/libvpx/vp8/common/vp8_skin_detection.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/alloccommon.h" +#include "vp8/common/vp8_skin_detection.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_write_yuv_frame.h" + +static int avg_2x2(const uint8_t *s, int p) { + int i, j; + int sum = 0; + for (i = 0; i < 2; ++i, s += p) { + for (j = 0; j < 2; ++j) { + sum += s[j]; + } + } + return (sum + 2) >> 2; +} + +int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, + SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv, + int curr_motion_magn) { + // No skin if block has been zero/small motion for long consecutive time. + if (consec_zeromv > 60 && curr_motion_magn == 0) { + return 0; + } else { + int motion = 1; + if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0; + if (bsize == SKIN_16X16) { + // Take the average of center 2x2 pixels. + const int ysource = avg_2x2(y + 7 * stride + 7, stride); + const int usource = avg_2x2(u + 3 * strideuv + 3, strideuv); + const int vsource = avg_2x2(v + 3 * strideuv + 3, strideuv); + return vpx_skin_pixel(ysource, usource, vsource, motion); + } else { + int num_skin = 0; + int i, j; + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + // Take the average of center 2x2 pixels. + const int ysource = avg_2x2(y + 3 * stride + 3, stride); + const int usource = avg_2x2(u + strideuv + 1, strideuv); + const int vsource = avg_2x2(v + strideuv + 1, strideuv); + num_skin += vpx_skin_pixel(ysource, usource, vsource, motion); + if (num_skin >= 2) return 1; + y += 8; + u += 4; + v += 4; + } + y += (stride << 3) - 16; + u += (strideuv << 2) - 8; + v += (strideuv << 2) - 8; + } + + return 0; + } + } +} + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. 
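Note that avg_2x2() above rounds to nearest rather than truncating: the +2 bias is half of the divisor 4. The SKIN_16X16 path then classifies the block from the rounded average of its center 2x2 pixels (y + 7 * stride + 7 for luma, 3 * strideuv + 3 for chroma). A self-contained check, duplicating the helper since this file is new in the diff:

    #include <assert.h>
    #include <stdint.h>

    static int avg_2x2(const uint8_t *s, int p) {
      int i, j, sum = 0;
      for (i = 0; i < 2; ++i, s += p) {
        for (j = 0; j < 2; ++j) sum += s[j];
      }
      return (sum + 2) >> 2; /* round to nearest instead of flooring */
    }

    int main(void) {
      /* stride 2, so the 2x2 patch is rows {10, 11} and {12, 13} */
      const uint8_t patch[4] = { 10, 11, 12, 13 };
      assert(avg_2x2(patch, 2) == 12); /* (46 + 2) >> 2 */
      return 0;
    }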
+void vp8_compute_skin_map(VP8_COMP *const cpi, FILE *yuv_skinmap_file) { + int i, j, mb_row, mb_col, num_bl; + VP8_COMMON *const cm = &cpi->common; + uint8_t *y; + const uint8_t *src_y = cpi->Source->y_buffer; + const int src_ystride = cpi->Source->y_stride; + int offset = 0; + + YV12_BUFFER_CONFIG skinmap; + memset(&skinmap, 0, sizeof(skinmap)); + if (vp8_yv12_alloc_frame_buffer(&skinmap, cm->Width, cm->Height, + VP8BORDERINPIXELS) < 0) { + vpx_free_frame_buffer(&skinmap); + return; + } + memset(skinmap.buffer_alloc, 128, skinmap.frame_size); + y = skinmap.y_buffer; + // Loop through blocks and set skin map based on center pixel of block. + // Set y to white for skin block, otherwise set to source with gray scale. + for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 1) { + num_bl = 0; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 1) { + const int is_skin = cpi->skin_map[offset++]; + for (i = 0; i < 16; i++) { + for (j = 0; j < 16; j++) { + y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j]; + } + } + num_bl++; + y += 16; + src_y += 16; + } + y += (src_ystride << 4) - (num_bl << 4); + src_y += (src_ystride << 4) - (num_bl << 4); + } + vpx_write_yuv_frame(yuv_skinmap_file, &skinmap); + vpx_free_frame_buffer(&skinmap); +} +#endif // OUTPUT_YUV_SKINMAP diff --git a/libvpx/vp8/common/vp8_skin_detection.h b/libvpx/vp8/common/vp8_skin_detection.h new file mode 100644 index 000000000..4d27f5eb2 --- /dev/null +++ b/libvpx/vp8/common/vp8_skin_detection.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_COMMON_SKIN_DETECTION_H_ +#define VP8_COMMON_SKIN_DETECTION_H_ + +#include "vp8/encoder/onyx_int.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/skin_detection.h" +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; + +typedef enum { + // Skin detection based on 8x8 block. If two of them are identified as skin, + // the macroblock is marked as skin. + SKIN_8X8, + // Skin detection based on 16x16 block. + SKIN_16X16 +} SKIN_DETECTION_BLOCK_SIZE; + +int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, + SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv, + int curr_motion_magn); + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. 
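The long run of one-line hunks below adds an explicit SECTION .text directive to each x86 assembly file. The apparent intent is to have the code emitted into an explicitly named text section regardless of what section the %include'd x86_abi_support.asm or the preceding macro and data definitions left open, keeping nasm and yasm output consistent; there is no functional change beyond section placement.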
+void vp8_compute_skin_map(struct VP8_COMP *const cpi, FILE *yuv_skinmap_file); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_SKIN_DETECTION_H_ diff --git a/libvpx/vp8/common/x86/copy_sse2.asm b/libvpx/vp8/common/x86/copy_sse2.asm index 86fae2695..480faa255 100644 --- a/libvpx/vp8/common/x86/copy_sse2.asm +++ b/libvpx/vp8/common/x86/copy_sse2.asm @@ -11,6 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void vp8_copy32xn_sse2( ; unsigned char *src_ptr, diff --git a/libvpx/vp8/common/x86/copy_sse3.asm b/libvpx/vp8/common/x86/copy_sse3.asm index d789a40cc..31ea898a3 100644 --- a/libvpx/vp8/common/x86/copy_sse3.asm +++ b/libvpx/vp8/common/x86/copy_sse3.asm @@ -83,6 +83,7 @@ ret %endmacro +SECTION .text ;void vp8_copy32xn_sse3( ; unsigned char *src_ptr, diff --git a/libvpx/vp8/common/x86/dequantize_mmx.asm b/libvpx/vp8/common/x86/dequantize_mmx.asm index 4e551f00a..bfdd99778 100644 --- a/libvpx/vp8/common/x86/dequantize_mmx.asm +++ b/libvpx/vp8/common/x86/dequantize_mmx.asm @@ -11,6 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q) global sym(vp8_dequantize_b_impl_mmx) PRIVATE diff --git a/libvpx/vp8/common/x86/idctllm_mmx.asm b/libvpx/vp8/common/x86/idctllm_mmx.asm index 96fa2c60d..5773d9d84 100644 --- a/libvpx/vp8/common/x86/idctllm_mmx.asm +++ b/libvpx/vp8/common/x86/idctllm_mmx.asm @@ -31,6 +31,7 @@ ; * ; **************************************************************************/ +SECTION .text ;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, ;int pitch, unsigned char *dest,int stride) diff --git a/libvpx/vp8/common/x86/idctllm_sse2.asm b/libvpx/vp8/common/x86/idctllm_sse2.asm index bf8e2c402..560faba00 100644 --- a/libvpx/vp8/common/x86/idctllm_sse2.asm +++ b/libvpx/vp8/common/x86/idctllm_sse2.asm @@ -19,6 +19,8 @@ ; int dst_stride - 3 ; ) +SECTION .text + global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE sym(vp8_idct_dequant_0_2x_sse2): push rbp diff --git a/libvpx/vp8/common/x86/iwalsh_sse2.asm b/libvpx/vp8/common/x86/iwalsh_sse2.asm index 06e86a80b..82d7bf91a 100644 --- a/libvpx/vp8/common/x86/iwalsh_sse2.asm +++ b/libvpx/vp8/common/x86/iwalsh_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp8_short_inv_walsh4x4_sse2(short *input, short *output) global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE sym(vp8_short_inv_walsh4x4_sse2): diff --git a/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm index 6d5aaa19d..6a3d05290 100644 --- a/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm +++ b/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm @@ -125,6 +125,8 @@ pxor %1, [GLOBAL(t80)] %endmacro +SECTION .text + ;void vp8_loop_filter_bh_y_sse2 ;( ; unsigned char *src_ptr, diff --git a/libvpx/vp8/common/x86/loopfilter_sse2.asm b/libvpx/vp8/common/x86/loopfilter_sse2.asm index 1913abc69..2ae028fea 100644 --- a/libvpx/vp8/common/x86/loopfilter_sse2.asm +++ b/libvpx/vp8/common/x86/loopfilter_sse2.asm @@ -276,6 +276,8 @@ %endmacro +SECTION .text + %if ABI_IS_32BIT ;void vp8_loop_filter_horizontal_edge_sse2 diff --git a/libvpx/vp8/common/x86/mfqe_sse2.asm b/libvpx/vp8/common/x86/mfqe_sse2.asm index 8177b7922..3fde973ad 100644 --- a/libvpx/vp8/common/x86/mfqe_sse2.asm +++ b/libvpx/vp8/common/x86/mfqe_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp8_filter_by_weight16x16_sse2 ;( ; unsigned char *src, diff 
--git a/libvpx/vp8/common/x86/recon_mmx.asm b/libvpx/vp8/common/x86/recon_mmx.asm index 43f2dc6c6..e6a48f6b0 100644 --- a/libvpx/vp8/common/x86/recon_mmx.asm +++ b/libvpx/vp8/common/x86/recon_mmx.asm @@ -11,6 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void copy_mem8x8_mmx( ; unsigned char *src, diff --git a/libvpx/vp8/common/x86/recon_sse2.asm b/libvpx/vp8/common/x86/recon_sse2.asm index cb89537f7..57f8899c7 100644 --- a/libvpx/vp8/common/x86/recon_sse2.asm +++ b/libvpx/vp8/common/x86/recon_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void copy_mem16x16_sse2( ; unsigned char *src, ; int src_stride, diff --git a/libvpx/vp8/common/x86/subpixel_mmx.asm b/libvpx/vp8/common/x86/subpixel_mmx.asm index 6ab7f1fdc..1f3a2baca 100644 --- a/libvpx/vp8/common/x86/subpixel_mmx.asm +++ b/libvpx/vp8/common/x86/subpixel_mmx.asm @@ -17,6 +17,7 @@ extern sym(vp8_bilinear_filters_x86_8) %define vp8_filter_weight 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;void vp8_filter_block1d_h6_mmx ;( diff --git a/libvpx/vp8/common/x86/subpixel_sse2.asm b/libvpx/vp8/common/x86/subpixel_sse2.asm index ca00583ca..6e70f6d2e 100644 --- a/libvpx/vp8/common/x86/subpixel_sse2.asm +++ b/libvpx/vp8/common/x86/subpixel_sse2.asm @@ -16,6 +16,7 @@ extern sym(vp8_bilinear_filters_x86_8) %define VP8_FILTER_WEIGHT 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;/************************************************************************************ ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The diff --git a/libvpx/vp8/common/x86/subpixel_ssse3.asm b/libvpx/vp8/common/x86/subpixel_ssse3.asm index 1f6cbd1d1..8d55c9320 100644 --- a/libvpx/vp8/common/x86/subpixel_ssse3.asm +++ b/libvpx/vp8/common/x86/subpixel_ssse3.asm @@ -15,6 +15,7 @@ %define VP8_FILTER_WEIGHT 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;/************************************************************************************ ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The diff --git a/libvpx/vp8/decoder/decodeframe.c b/libvpx/vp8/decoder/decodeframe.c index 0aec2a01b..077bd3da2 100644 --- a/libvpx/vp8/decoder/decodeframe.c +++ b/libvpx/vp8/decoder/decodeframe.c @@ -930,7 +930,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { /* When error concealment is enabled we should only check the sync * code if we have enough bits available */ - if (!pbi->ec_active || data + 3 < data_end) { + if (data + 3 < data_end) { if (clear[0] != 0x9d || clear[1] != 0x01 || clear[2] != 0x2a) { vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame sync code"); @@ -941,13 +941,19 @@ int vp8_decode_frame(VP8D_COMP *pbi) { * if we have enough data. Otherwise we will end up with the wrong * size. */ - if (!pbi->ec_active || data + 6 < data_end) { + if (data + 6 < data_end) { pc->Width = (clear[3] | (clear[4] << 8)) & 0x3fff; pc->horiz_scale = clear[4] >> 6; pc->Height = (clear[5] | (clear[6] << 8)) & 0x3fff; pc->vert_scale = clear[6] >> 6; + data += 7; + } else if (!pbi->ec_active) { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated key frame header"); + } else { + /* Error concealment is active, clear the frame. 
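The decodeframe.c hunk above tightens key-frame header parsing. The old condition `!pbi->ec_active || data + 6 < data_end` skipped the bounds check entirely whenever error concealment was off, and `data += 7` then ran unconditionally, so a truncated packet could both read the sync/size fields past data_end and advance the read pointer beyond the buffer. The new flow consumes the 7 header bytes only when they are actually present, raises VPX_CODEC_CORRUPT_FRAME for a truncated header when concealment is inactive, and otherwise clamps data to data_end so the concealment path takes over cleanly.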
*/ + data = data_end; } - data += 7; } else { memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG)); memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG)); @@ -1199,7 +1205,8 @@ int vp8_decode_frame(VP8D_COMP *pbi) { pbi->frame_corrupt_residual = 0; #if CONFIG_MULTITHREAD - if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) && + pc->multi_token_partition != ONE_PARTITION) { unsigned int thread; vp8mt_decode_mb_rows(pbi, xd); vp8_yv12_extend_frame_borders(yv12_fb_new); diff --git a/libvpx/vp8/decoder/decodemv.c b/libvpx/vp8/decoder/decodemv.c index b946ab73d..8e9600c6d 100644 --- a/libvpx/vp8/decoder/decodemv.c +++ b/libvpx/vp8/decoder/decodemv.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "decodemv.h" #include "treereader.h" #include "vp8/common/entropymv.h" #include "vp8/common/entropymode.h" @@ -64,8 +65,7 @@ static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc) { const vp8_prob *const p = (const vp8_prob *)mvc; int x = 0; - if (vp8_read(r, p[mvpis_short])) /* Large */ - { + if (vp8_read(r, p[mvpis_short])) { /* Large */ int i = 0; do { @@ -284,8 +284,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi) { vp8_reader *const bc = &pbi->mbc[8]; mbmi->ref_frame = (MV_REFERENCE_FRAME)vp8_read(bc, pbi->prob_intra); - if (mbmi->ref_frame) /* inter MB */ - { + if (mbmi->ref_frame) { /* inter MB */ enum { CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV }; int cnt[4]; int *cntx = cnt; diff --git a/libvpx/vp8/decoder/onyxd_if.c b/libvpx/vp8/decoder/onyxd_if.c index 789c2eeff..f516eb0c7 100644 --- a/libvpx/vp8/decoder/onyxd_if.c +++ b/libvpx/vp8/decoder/onyxd_if.c @@ -41,7 +41,6 @@ #endif extern void vp8_init_loop_filter(VP8_COMMON *cm); -extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi); static int get_free_fb(VP8_COMMON *cm); static void ref_cnt_fb(int *buf, int *idx, int new_idx); diff --git a/libvpx/vp8/decoder/onyxd_int.h b/libvpx/vp8/decoder/onyxd_int.h index 88b1ff16b..5ecacdbb9 100644 --- a/libvpx/vp8/decoder/onyxd_int.h +++ b/libvpx/vp8/decoder/onyxd_int.h @@ -68,7 +68,7 @@ typedef struct VP8D_COMP { #if CONFIG_MULTITHREAD /* variable for threading */ - int b_multithreaded_rd; + vpx_atomic_int b_multithreaded_rd; int max_threads; int current_mb_col_main; unsigned int decoding_thread_count; @@ -76,9 +76,8 @@ typedef struct VP8D_COMP { int mt_baseline_filter_level[MAX_MB_SEGMENTS]; int sync_range; - int *mt_current_mb_col; /* Each row remembers its already decoded column. */ - pthread_mutex_t *pmutex; - pthread_mutex_t mt_mutex; /* mutex for b_multithreaded_rd */ + /* Each row remembers its already decoded column. 
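onyxd_int.h retypes b_multithreaded_rd and the per-row progress array as vpx_atomic_int, which is what allows the pmutex array and mt_mutex to be deleted outright in threading.c further down. The API surface these hunks rely on is small; a sketch of how it is used, assuming vpx_util/vpx_atomics.h is on the include path and provides exactly the names seen in this diff:

    #include "vpx_util/vpx_atomics.h"

    static vpx_atomic_int running = VPX_ATOMIC_INIT(0); /* static initializer */

    void start(void) {
      vpx_atomic_init(&running, 1); /* plain init, before any thread exists */
    }

    int keep_going(void) {
      return vpx_atomic_load_acquire(&running); /* reader side */
    }

    void stop(void) {
      vpx_atomic_store_release(&running, 0); /* writer side; readers see 0 */
    }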
*/ + vpx_atomic_int *mt_current_mb_col; unsigned char **mt_yabove_row; /* mb_rows x width */ unsigned char **mt_uabove_row; @@ -119,6 +118,8 @@ typedef struct VP8D_COMP { void *decrypt_state; } VP8D_COMP; +void vp8cx_init_de_quantizer(VP8D_COMP *pbi); +void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); int vp8_decode_frame(VP8D_COMP *cpi); int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf); diff --git a/libvpx/vp8/decoder/threading.c b/libvpx/vp8/decoder/threading.c index 9f7751988..d0213f75c 100644 --- a/libvpx/vp8/decoder/threading.c +++ b/libvpx/vp8/decoder/threading.c @@ -20,6 +20,7 @@ #include "vp8/common/loopfilter.h" #include "vp8/common/extend.h" #include "vpx_ports/vpx_timer.h" +#include "decoderthreading.h" #include "detokenize.h" #include "vp8/common/reconintra4x4.h" #include "vp8/common/reconinter.h" @@ -36,8 +37,6 @@ memset((p), 0, (n) * sizeof(*(p))); \ } while (0) -void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); - static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count) { VP8_COMMON *const pc = &pbi->common; @@ -80,7 +79,8 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, if (pc->full_pixel) mbd->fullpixel_mask = 0xfffffff8; } - for (i = 0; i < pc->mb_rows; ++i) pbi->mt_current_mb_col[i] = -1; + for (i = 0; i < pc->mb_rows; ++i) + vpx_atomic_store_release(&pbi->mt_current_mb_col[i], -1); } static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, @@ -248,12 +248,13 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row) { - const int *last_row_current_mb_col; - int *current_mb_col; + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col; int mb_row; VP8_COMMON *pc = &pbi->common; const int nsync = pbi->sync_range; - const int first_row_no_sync_above = pc->mb_cols + nsync; + const vpx_atomic_int first_row_no_sync_above = + VPX_ATOMIC_INIT(pc->mb_cols + nsync); int num_part = 1 << pbi->common.multi_token_partition; int last_mb_row = start_mb_row; @@ -357,13 +358,11 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) { if (((mb_col - 1) % nsync) == 0) { - pthread_mutex_t *mutex = &pbi->pmutex[mb_row]; - protected_write(mutex, current_mb_col, mb_col - 1); + vpx_atomic_store_release(current_mb_col, mb_col - 1); } if (mb_row && !(mb_col & (nsync - 1))) { - pthread_mutex_t *mutex = &pbi->pmutex[mb_row - 1]; - sync_read(mutex, mb_col, last_row_current_mb_col, nsync); + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } /* Distance of MB to the various image edges. 
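Taken together, mt_decode_mb_rows() forms a wavefront: each row publishes its progress every nsync columns via vpx_atomic_store_release(current_mb_col, mb_col - 1) and stalls in vp8_atomic_spin_wait() until the row above is at least nsync columns ahead. For example, with nsync == 4, once the row above has published column 8 the row below may work through column 4 before it has to wait again. The first_row_no_sync_above sentinel (mb_cols + nsync) is wired in as the parent of row 0 so the top row never blocks, and publishing mb_col + nsync at the end of a row releases the row below for good.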
@@ -549,7 +548,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, } /* last MB of row is ready just after extension is done */ - protected_write(&pbi->pmutex[mb_row], current_mb_col, mb_col + nsync); + vpx_atomic_store_release(current_mb_col, mb_col + nsync); ++xd->mode_info_context; /* skip prediction column */ xd->up_available = 1; @@ -569,10 +568,10 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) { ENTROPY_CONTEXT_PLANES mb_row_left_context; while (1) { - if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) break; + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break; if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) { - if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) { break; } else { MACROBLOCKD *xd = &mbrd->mbd; @@ -590,9 +589,8 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { int core_count = 0; unsigned int ithread; - pbi->b_multithreaded_rd = 0; + vpx_atomic_init(&pbi->b_multithreaded_rd, 0); pbi->allocated_decoding_thread_count = 0; - pthread_mutex_init(&pbi->mt_mutex, NULL); /* limit decoding threads to the max number of token partitions */ core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads; @@ -603,7 +601,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { } if (core_count > 1) { - pbi->b_multithreaded_rd = 1; + vpx_atomic_init(&pbi->b_multithreaded_rd, 1); pbi->decoding_thread_count = core_count - 1; CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count); @@ -649,16 +647,6 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows) { int i; - /* De-allocate mutex */ - if (pbi->pmutex != NULL) { - for (i = 0; i < mb_rows; ++i) { - pthread_mutex_destroy(&pbi->pmutex[i]); - } - - vpx_free(pbi->pmutex); - pbi->pmutex = NULL; - } - vpx_free(pbi->mt_current_mb_col); pbi->mt_current_mb_col = NULL; @@ -724,7 +712,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { int i; int uv_width; - if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows); /* our internal buffers are always multiples of 16 */ @@ -742,36 +730,33 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { uv_width = width >> 1; - /* Allocate mutex */ - CHECK_MEM_ERROR(pbi->pmutex, - vpx_malloc(sizeof(*pbi->pmutex) * pc->mb_rows)); - if (pbi->pmutex) { - for (i = 0; i < pc->mb_rows; ++i) { - pthread_mutex_init(&pbi->pmutex[i], NULL); - } - } - - /* Allocate an int for each mb row. */ - CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows); + /* Allocate a vpx_atomic_int for each mb row. */ + CHECK_MEM_ERROR(pbi->mt_current_mb_col, + vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows)); + for (i = 0; i < pc->mb_rows; ++i) + vpx_atomic_init(&pbi->mt_current_mb_col[i], 0); /* Allocate memory for above_row buffers. 
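One detail in vp8mt_alloc_temp_buffers(): mt_current_mb_col moves from CALLOC_ARRAY to vpx_malloc followed by an explicit vpx_atomic_init() loop, presumably because zero-filled memory is not a portable way to initialize an atomic object; every element now goes through the type's own initializer. The surrounding CHECK_MEM_ERROR reflows are formatting churn only.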
*/ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_yabove_row[i], - vpx_memalign(16, sizeof(unsigned char) * - (width + (VP8BORDERINPIXELS << 1)))); + CHECK_MEM_ERROR( + pbi->mt_yabove_row[i], + vpx_memalign( + 16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1)))); CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_uabove_row[i], - vpx_memalign(16, sizeof(unsigned char) * - (uv_width + VP8BORDERINPIXELS))); + CHECK_MEM_ERROR( + pbi->mt_uabove_row[i], + vpx_memalign(16, + sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_vabove_row[i], - vpx_memalign(16, sizeof(unsigned char) * - (uv_width + VP8BORDERINPIXELS))); + CHECK_MEM_ERROR( + pbi->mt_vabove_row[i], + vpx_memalign(16, + sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); /* Allocate memory for left_col buffers. */ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); @@ -793,9 +778,9 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { void vp8_decoder_remove_threads(VP8D_COMP *pbi) { /* shutdown MB Decoding thread; */ - if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { int i; - protected_write(&pbi->mt_mutex, &pbi->b_multithreaded_rd, 0); + vpx_atomic_store_release(&pbi->b_multithreaded_rd, 0); /* allow all threads to exit */ for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { @@ -825,7 +810,6 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) { vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows); } - pthread_mutex_destroy(&pbi->mt_mutex); } void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { diff --git a/libvpx/vp8/encoder/bitstream.c b/libvpx/vp8/encoder/bitstream.c index 7086faae9..8cacb6450 100644 --- a/libvpx/vp8/encoder/bitstream.c +++ b/libvpx/vp8/encoder/bitstream.c @@ -500,8 +500,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { } write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob); - } else /* inter coded */ - { + } else { /* inter coded */ int_mv best_mv; vp8_prob mv_ref_p[VP8_MVREFS - 1]; @@ -1416,7 +1415,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { pack_mb_row_tokens(cpi, &cpi->bc[1]); } else { vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count); diff --git a/libvpx/vp8/encoder/bitstream.h b/libvpx/vp8/encoder/bitstream.h index 2b196dcd2..ed45bff9e 100644 --- a/libvpx/vp8/encoder/bitstream.h +++ b/libvpx/vp8/encoder/bitstream.h @@ -15,7 +15,15 @@ extern "C" { #endif +#include "vp8/encoder/treewriter.h" +#include "vp8/encoder/tokenize.h" + void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount); +void vp8_convert_rfct_to_prob(struct VP8_COMP *const cpi); +void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra, + int prob_last, int prob_garf); +int vp8_estimate_entropy_savings(struct VP8_COMP *cpi); +void vp8_update_coef_probs(struct VP8_COMP *cpi); #ifdef __cplusplus } // extern "C" diff --git a/libvpx/vp8/encoder/encodeframe.c b/libvpx/vp8/encoder/encodeframe.c index c7ad3bfe2..9bb0df72d 100644 --- a/libvpx/vp8/encoder/encodeframe.c +++ b/libvpx/vp8/encoder/encodeframe.c @@ -11,8 +11,12 @@ #include "vpx_config.h" #include "vp8_rtcd.h" #include 
"./vpx_dsp_rtcd.h" +#include "bitstream.h" #include "encodemb.h" #include "encodemv.h" +#if CONFIG_MULTITHREAD +#include "ethreading.h" +#endif #include "vp8/common/common.h" #include "onyx_int.h" #include "vp8/common/extend.h" @@ -35,13 +39,6 @@ #include "encodeframe.h" extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); -extern void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra, - int prob_last, int prob_garf); -extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi); -extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex); -extern void vp8_auto_select_speed(VP8_COMP *cpi); -extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, - MB_ROW_COMP *mbr_ei, int count); static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x); #ifdef MODE_STATS @@ -344,11 +341,11 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, #if CONFIG_MULTITHREAD const int nsync = cpi->mt_sync_range; - const int rightmost_col = cm->mb_cols + nsync; - const int *last_row_current_mb_col; - int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + vpx_atomic_int rightmost_col = VPX_ATOMIC_INIT(cm->mb_cols + nsync); + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; - if ((cpi->b_multi_threaded != 0) && (mb_row != 0)) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0 && mb_row != 0) { last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1]; } else { last_row_current_mb_col = &rightmost_col; @@ -418,15 +415,13 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded != 0) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { if (((mb_col - 1) % nsync) == 0) { - pthread_mutex_t *mutex = &cpi->pmutex[mb_row]; - protected_write(mutex, current_mb_col, mb_col - 1); + vpx_atomic_store_release(current_mb_col, mb_col - 1); } if (mb_row && !(mb_col & (nsync - 1))) { - pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1]; - sync_read(mutex, mb_col, last_row_current_mb_col, nsync); + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } } #endif @@ -566,8 +561,9 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded != 0) { - protected_write(&cpi->pmutex[mb_row], current_mb_col, rightmost_col); + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { + vpx_atomic_store_release(current_mb_col, + vpx_atomic_load_acquire(&rightmost_col)); } #endif @@ -752,13 +748,14 @@ void vp8_encode_frame(VP8_COMP *cpi) { vpx_usec_timer_start(&emr_timer); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { int i; vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, cpi->encoding_thread_count); - for (i = 0; i < cm->mb_rows; ++i) cpi->mt_current_mb_col[i] = -1; + for (i = 0; i < cm->mb_rows; ++i) + vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1); for (i = 0; i < cpi->encoding_thread_count; ++i) { sem_post(&cpi->h_event_start_encoding[i]); diff --git a/libvpx/vp8/encoder/encodeframe.h b/libvpx/vp8/encoder/encodeframe.h index c1d863492..5274aba41 100644 --- a/libvpx/vp8/encoder/encodeframe.h +++ b/libvpx/vp8/encoder/encodeframe.h @@ -10,24 +10,29 @@ #ifndef VP8_ENCODER_ENCODEFRAME_H_ #define VP8_ENCODER_ENCODEFRAME_H_ +#include "vp8/encoder/tokenize.h" + #ifdef 
__cplusplus extern "C" { #endif -extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x); -extern void vp8_build_block_offsets(MACROBLOCK *x); +struct VP8_COMP; +struct macroblock; + +void vp8_activity_masking(struct VP8_COMP *cpi, MACROBLOCK *x); + +void vp8_build_block_offsets(struct macroblock *x); -extern void vp8_setup_block_ptrs(MACROBLOCK *x); +void vp8_setup_block_ptrs(struct macroblock *x); -extern void vp8_encode_frame(VP8_COMP *cpi); +void vp8_encode_frame(struct VP8_COMP *cpi); -extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, - TOKENEXTRA **t, int recon_yoffset, - int recon_uvoffset, int mb_row, - int mb_col); +int vp8cx_encode_inter_macroblock(struct VP8_COMP *cpi, struct macroblock *x, + TOKENEXTRA **t, int recon_yoffset, + int recon_uvoffset, int mb_row, int mb_col); -extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x, - TOKENEXTRA **t); +int vp8cx_encode_intra_macroblock(struct VP8_COMP *cpi, struct macroblock *x, + TOKENEXTRA **t); #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vp8/encoder/encodemv.c b/libvpx/vp8/encoder/encodemv.c index 36e9a9078..ea93ccd71 100644 --- a/libvpx/vp8/encoder/encodemv.c +++ b/libvpx/vp8/encoder/encodemv.c @@ -25,14 +25,12 @@ static void encode_mvcomponent(vp8_writer *const w, const int v, const vp8_prob *p = mvc->prob; const int x = v < 0 ? -v : v; - if (x < mvnum_short) /* Small */ - { + if (x < mvnum_short) { /* Small */ vp8_write(w, 0, p[mvpis_short]); vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3); if (!x) return; /* no sign bit */ - } else /* Large */ - { + } else { /* Large */ int i = 0; vp8_write(w, 1, p[mvpis_short]); diff --git a/libvpx/vp8/encoder/ethreading.c b/libvpx/vp8/encoder/ethreading.c index df34997ac..55a1528b1 100644 --- a/libvpx/vp8/encoder/ethreading.c +++ b/libvpx/vp8/encoder/ethreading.c @@ -14,6 +14,7 @@ #include "vp8/common/extend.h" #include "bitstream.h" #include "encodeframe.h" +#include "ethreading.h" #if CONFIG_MULTITHREAD @@ -25,11 +26,11 @@ static THREAD_FUNCTION thread_loopfilter(void *p_data) { VP8_COMMON *cm = &cpi->common; while (1) { - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; if (sem_wait(&cpi->h_event_start_lpf) == 0) { /* we're shutting down */ - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; vp8_loopfilter_frame(cpi, cm); @@ -47,7 +48,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { ENTROPY_CONTEXT_PLANES mb_row_left_context; while (1) { - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) { const int nsync = cpi->mt_sync_range; @@ -65,7 +66,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { int *totalrate = &mbri->totalrate; /* we're shutting down */ - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; xd->mode_info_context = cm->mi + cm->mode_info_stride * (ithread + 1); xd->mode_info_stride = cm->mode_info_stride; @@ -79,8 +80,8 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride; int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride; int map_index = (mb_row * cm->mb_cols); - const int *last_row_current_mb_col; - 
int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)]; @@ -107,13 +108,11 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { /* for each macroblock col in image */ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { if (((mb_col - 1) % nsync) == 0) { - pthread_mutex_t *mutex = &cpi->pmutex[mb_row]; - protected_write(mutex, current_mb_col, mb_col - 1); + vpx_atomic_store_release(current_mb_col, mb_col - 1); } if (mb_row && !(mb_col & (nsync - 1))) { - pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1]; - sync_read(mutex, mb_col, last_row_current_mb_col, nsync); + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING @@ -285,7 +284,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { vp8_extend_mb_row(&cm->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); - protected_write(&cpi->pmutex[mb_row], current_mb_col, mb_col + nsync); + vpx_atomic_store_release(current_mb_col, mb_col + nsync); /* this is to account for the border */ xd->mode_info_context++; @@ -489,12 +488,10 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, int vp8cx_create_encoder_threads(VP8_COMP *cpi) { const VP8_COMMON *cm = &cpi->common; - cpi->b_multi_threaded = 0; + vpx_atomic_init(&cpi->b_multi_threaded, 0); cpi->encoding_thread_count = 0; cpi->b_lpf_running = 0; - pthread_mutex_init(&cpi->mt_mutex, NULL); - if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) { int ithread; int th_count = cpi->oxcf.multi_threaded - 1; @@ -525,7 +522,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { CHECK_MEM_ERROR(cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count)); - cpi->b_multi_threaded = 1; + vpx_atomic_store_release(&cpi->b_multi_threaded, 1); cpi->encoding_thread_count = th_count; /* @@ -554,7 +551,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (rc) { /* shutdown other threads */ - protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { pthread_join(cpi->h_encoding_thread[ithread], 0); sem_destroy(&cpi->h_event_start_encoding[ithread]); @@ -568,8 +565,6 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { vpx_free(cpi->mb_row_ei); vpx_free(cpi->en_thread_data); - pthread_mutex_destroy(&cpi->mt_mutex); - return -1; } @@ -584,7 +579,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (rc) { /* shutdown other threads */ - protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { sem_post(&cpi->h_event_start_encoding[ithread]); sem_post(&cpi->h_event_end_encoding[ithread]); @@ -602,8 +597,6 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { vpx_free(cpi->mb_row_ei); vpx_free(cpi->en_thread_data); - pthread_mutex_destroy(&cpi->mt_mutex); - return -2; } } @@ -612,9 +605,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { } void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded)) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { /* shutdown other threads */ - protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); + 
vpx_atomic_store_release(&cpi->b_multi_threaded, 0); { int i; @@ -642,6 +635,5 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { vpx_free(cpi->mb_row_ei); vpx_free(cpi->en_thread_data); } - pthread_mutex_destroy(&cpi->mt_mutex); } #endif diff --git a/libvpx/vp8/encoder/ethreading.h b/libvpx/vp8/encoder/ethreading.h new file mode 100644 index 000000000..95bf73d18 --- /dev/null +++ b/libvpx/vp8/encoder/ethreading.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_ENCODER_ETHREADING_H_ +#define VP8_ENCODER_ETHREADING_H_ + +#include "vp8/encoder/onyx_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; +struct macroblock; + +void vp8cx_init_mbrthread_data(struct VP8_COMP *cpi, struct macroblock *x, + MB_ROW_COMP *mbr_ei, int count); +int vp8cx_create_encoder_threads(struct VP8_COMP *cpi); +void vp8cx_remove_encoder_threads(struct VP8_COMP *cpi); + +#ifdef __cplusplus +} +#endif + +#endif // VP8_ENCODER_ETHREADING_H_ diff --git a/libvpx/vp8/encoder/firstpass.c b/libvpx/vp8/encoder/firstpass.c index caf19059e..70f924341 100644 --- a/libvpx/vp8/encoder/firstpass.c +++ b/libvpx/vp8/encoder/firstpass.c @@ -1273,8 +1273,9 @@ void vp8_init_second_pass(VP8_COMP *cpi) { * sum duration is not. Its calculated based on the actual durations of * all frames from the first pass. */ - vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / - cpi->twopass.total_stats.duration); + vp8_new_framerate(cpi, + 10000000.0 * cpi->twopass.total_stats.count / + cpi->twopass.total_stats.duration); cpi->output_framerate = cpi->framerate; cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * diff --git a/libvpx/vp8/encoder/mcomp.c b/libvpx/vp8/encoder/mcomp.c index b4a49a3b1..970120f3b 100644 --- a/libvpx/vp8/encoder/mcomp.c +++ b/libvpx/vp8/encoder/mcomp.c @@ -34,22 +34,19 @@ int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight) { * NEAREST for subsequent blocks. The "Weight" parameter allows, to a * limited extent, for some account to be taken of these factors. 
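The mcomp.c hunk below drops the clamp()-based index computation and restores direct table indexing, i.e. cost = ((mvcost[0][(row diff) >> 1] + mvcost[1][(col diff) >> 1]) * Weight) >> 7. Weight is a Q7 scale (128 meaning 1.0), so the >> 7 normalizes it back out; mv_err_cost applies error_per_bit with a +128 rounder and >> 8 in the same spirit. The clamped variant being removed looks like a downstream bounds-safety patch that this update reverts in favor of the upstream code, which relies on callers keeping the difference within the cost-table range.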
*/ - const int mv_idx_row = - clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals); - const int mv_idx_col = - clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals); - return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * Weight) >> 7; + return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] + + mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) * + Weight) >> + 7; } static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int error_per_bit) { /* Ignore mv costing if mvcost is NULL */ if (mvcost) { - const int mv_idx_row = - clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals); - const int mv_idx_col = - clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals); - return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * error_per_bit + + return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] + + mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) * + error_per_bit + 128) >> 8; } diff --git a/libvpx/vp8/encoder/mips/mmi/dct_mmi.c b/libvpx/vp8/encoder/mips/mmi/dct_mmi.c new file mode 100644 index 000000000..1f60a692d --- /dev/null +++ b/libvpx/vp8/encoder/mips/mmi/dct_mmi.c @@ -0,0 +1,425 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +/* clang-format off */ +/* TRANSPOSE_4H: transpose 4x4 matrix. + Input: ftmp1,ftmp2,ftmp3,ftmp4 + Output: ftmp1,ftmp2,ftmp3,ftmp4 + Note: ftmp0 always be 0, ftmp5~9 used for temporary value. 
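The constants staged in the MMI transform below map directly onto the scalar vp8_short_fdct4x4: the pmullh by 8 is the << 3 input scaling, 5352/2217 with the 14500/7500 rounders and a 12-bit shift are the first pass, and 12000/51000 with a 16-bit shift (plus the +7, >> 4 pair) are the second pass. For orientation, a sketch of the first-pass row butterfly as it looks in plain C (close to, though not copied verbatim from, vp8/encoder/dct.c):

    static void fdct4_row(const short *ip, short *op) {
      const int a1 = (ip[0] + ip[3]) * 8;
      const int b1 = (ip[1] + ip[2]) * 8;
      const int c1 = (ip[1] - ip[2]) * 8;
      const int d1 = (ip[0] - ip[3]) * 8;
      op[0] = (short)(a1 + b1);
      op[2] = (short)(a1 - b1);
      op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
      op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);
    }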
+ */ +#define TRANSPOSE_4H \ + MMI_LI(%[tmp0], 0x93) \ + "mtc1 %[tmp0], %[ftmp10] \n\t" \ + "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ + "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ + "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" +/* clang-format on */ + +void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { + uint64_t tmp[1]; + int16_t *ip = input; + +#if _MIPS_SIM == _ABIO32 + register double ftmp0 asm("$f0"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f4"); + register double ftmp3 asm("$f6"); + register double ftmp4 asm("$f8"); + register double ftmp5 asm("$f10"); + register double ftmp6 asm("$f12"); + register double ftmp7 asm("$f14"); + register double ftmp8 asm("$f16"); + register double ftmp9 asm("$f18"); + register double ftmp10 asm("$f20"); + register double ftmp11 asm("$f22"); + register double ftmp12 asm("$f24"); +#else + register double ftmp0 asm("$f0"); + register double ftmp1 asm("$f1"); + register double ftmp2 asm("$f2"); + register double ftmp3 asm("$f3"); + register double ftmp4 asm("$f4"); + register double ftmp5 asm("$f5"); + register double ftmp6 asm("$f6"); + register double ftmp7 asm("$f7"); + register double ftmp8 asm("$f8"); + register double ftmp9 asm("$f9"); + register double ftmp10 asm("$f10"); + register double ftmp11 asm("$f11"); + register double ftmp12 asm("$f12"); +#endif // _MIPS_SIM == _ABIO32 + + DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL }; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" + 
MMI_ADDU(%[ip], %[ip], %[pitch]) + TRANSPOSE_4H + + "ldc1 %[ftmp11], %[ff_ph_8] \n\t" + // f1 + f4 + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // a1 + "pmullh %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + // f2 + f3 + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + // b1 + "pmullh %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + // f2 - f3 + "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c1 + "pmullh %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + // f1 - f4 + "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" + // d1 + "pmullh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + // op[0] = a1 + b1 + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + // op[2] = a1 - b1 + "psubh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + + // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12 + MMI_LI(%[tmp0], 0x0c) + "mtc1 %[tmp0], %[ftmp11] \n\t" + "ldc1 %[ftmp12], %[ff_pw_14500] \n\t" + "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t" + "punpckhhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op1] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + + // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12 + "ldc1 %[ftmp12], %[ff_pw_7500] \n\t" + "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t" + "punpckhhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op3] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + TRANSPOSE_4H + + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" + + "pcmpeqh %[ftmp0], %[ftmp8], %[ftmp0] \n\t" + "ldc1 %[ftmp9], %[ff_ph_01] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "ldc1 %[ftmp9], %[ff_ph_07] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_LI(%[tmp0], 0x04) + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + + MMI_LI(%[tmp0], 0x10) + "mtc1 %[tmp0], %[ftmp9] \n\t" + "ldc1 %[ftmp12], %[ff_pw_12000] \n\t" + "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t" + "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op1] \n\t" + "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "packsswh %[ftmp3], %[ftmp10], %[ftmp11] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + + "ldc1 %[ftmp12], %[ff_pw_51000] \n\t" + "punpcklhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op3] \n\t" + "punpckhhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op3] \n\t" + "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t" + + "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t" + "gssdrc1 %[ftmp1], 
0x00(%[output]) \n\t" + "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t" + "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t" + + : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2), + [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5), + [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8), + [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11), + [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip) + : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), + [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3), + [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500), + [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000), + [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217), + [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output) + : "memory" + ); +} + +void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) { + vp8_short_fdct4x4_mmi(input, output, pitch); + vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch); +} + +void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { + double ftmp[13]; + uint32_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL }; + + __asm__ volatile ( + MMI_LI(%[tmp0], 0x02) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" + TRANSPOSE_4H + + "psllh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "psllh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "psllh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "psllh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // d + "paddh %[ftmp6], %[ftmp2], %[ftmp4] \n\t" + // c + "psubh %[ftmp7], %[ftmp2], %[ftmp4] \n\t" + // b + "psubh %[ftmp8], %[ftmp1], %[ftmp3] \n\t" + + // a + d + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + // b + c + "paddh %[ftmp2], %[ftmp8], %[ftmp7] \n\t" + // b - c + "psubh %[ftmp3], %[ftmp8], %[ftmp7] \n\t" + // a - d + "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + + "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + TRANSPOSE_4H + + // op[2], op[0] + "pmaddhw %[ftmp5], %[ftmp1], %[ff_pw_01] \n\t" + // op[3], op[1] + "pmaddhw %[ftmp1], %[ftmp1], %[ff_pw_mask] \n\t" + + // op[6], op[4] + "pmaddhw %[ftmp6], %[ftmp2], %[ff_pw_01] \n\t" + // op[7], op[5] + "pmaddhw %[ftmp2], %[ftmp2], %[ff_pw_mask] \n\t" + + // op[10], op[8] + "pmaddhw %[ftmp7], %[ftmp3], %[ff_pw_01] \n\t" + // op[11], op[9] + "pmaddhw %[ftmp3], %[ftmp3], %[ff_pw_mask] \n\t" + + // op[14], op[12] + "pmaddhw %[ftmp8], %[ftmp4], %[ff_pw_01] \n\t" + // op[15], op[13] + "pmaddhw %[ftmp4], %[ftmp4], %[ff_pw_mask] \n\t" + + // a1, a3 + "paddw %[ftmp9], %[ftmp5], %[ftmp7] \n\t" + // 
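A gloss for one idiom in the Walsh transform below: each pcmpgtw / and-with-ff_pw_01 / paddw / add-ff_pw_03 / psraw-by-3 group is the vector form of the scalar rounding step `x += (x < 0); out = (x + 3) >> 3;`. pcmpgtw against zero yields an all-ones mask in the negative lanes, the AND turns that mask into +1, and the shift then rounds symmetrically around zero, matching vp8_short_walsh4x4_c.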
d1, d3 + "paddw %[ftmp10], %[ftmp6], %[ftmp8] \n\t" + // c1, c3 + "psubw %[ftmp11], %[ftmp6], %[ftmp8] \n\t" + // b1, b3 + "psubw %[ftmp12], %[ftmp5], %[ftmp7] \n\t" + + // a1 + d1, a3 + d3 + "paddw %[ftmp5], %[ftmp9], %[ftmp10] \n\t" + // b1 + c1, b3 + c3 + "paddw %[ftmp6], %[ftmp12], %[ftmp11] \n\t" + // b1 - c1, b3 - c3 + "psubw %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + // a1 - d1, a3 - d3 + "psubw %[ftmp8], %[ftmp9], %[ftmp10] \n\t" + + // a2, a4 + "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t" + // d2, d4 + "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t" + // c2, c4 + "psubw %[ftmp11], %[ftmp2], %[ftmp4] \n\t" + // b2, b4 + "psubw %[ftmp12], %[ftmp1], %[ftmp3] \n\t" + + // a2 + d2, a4 + d4 + "paddw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + // b2 + c2, b4 + c4 + "paddw %[ftmp2], %[ftmp12], %[ftmp11] \n\t" + // b2 - c2, b4 - c4 + "psubw %[ftmp3], %[ftmp12], %[ftmp11] \n\t" + // a2 - d2, a4 - d4 + "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t" + + MMI_LI(%[tmp0], 0x03) + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ff_pw_03] \n\t" + "psraw %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddw %[ftmp2], %[ftmp2], %[ff_pw_03] \n\t" + "psraw %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp3] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ff_pw_03] \n\t" + "psraw %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp4] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + "paddw %[ftmp4], %[ftmp4], %[ff_pw_03] \n\t" + "psraw %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp5] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ff_pw_03] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp6] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ff_pw_03] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp7] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp7], %[ftmp7], %[ftmp9] \n\t" + "paddw %[ftmp7], %[ftmp7], %[ff_pw_03] \n\t" + "psraw %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp8] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" + "paddw %[ftmp8], %[ftmp8], %[ff_pw_03] \n\t" + "psraw %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + + "packsswh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packsswh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "packsswh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + + MMI_LI(%[tmp0], 0x72) + "mtc1 %[tmp0], %[ftmp11] \n\t" + "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "pshufh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" + "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" + "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" + "gssdrc1 
%[ftmp4], 0x18(%[op]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), + [tmp0]"=&r"(tmp[0]), + [ip]"+&r"(input) + : [op]"r"(output), + [ff_pw_01]"f"(ff_pw_01), [pitch]"r"((mips_reg)pitch), + [ff_pw_03]"f"(ff_pw_03), [ff_pw_mask]"f"(ff_pw_mask), + [ff_ph_01]"f"(ff_ph_01) + : "memory" + ); +} diff --git a/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c new file mode 100644 index 000000000..3ccb196ff --- /dev/null +++ b/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vp8/encoder/onyx_int.h" +#include "vp8/encoder/quantize.h" +#include "vp8/common/quant_common.h" + +#define REGULAR_SELECT_EOB(i, rc) \ + z = coeff_ptr[rc]; \ + sz = (z >> 31); \ + x = (z ^ sz) - sz; \ + zbin = zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value; \ + if (x >= zbin) { \ + x += round_ptr[rc]; \ + y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16; \ + if (y) { \ + x = (y ^ sz) - sz; \ + qcoeff_ptr[rc] = x; \ + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; \ + eob = i; \ + zbin_boost_ptr = b->zrun_zbin_boost; \ + } \ + } + +void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { + const int16_t *coeff_ptr = b->coeff; + const int16_t *round_ptr = b->round; + const int16_t *quant_ptr = b->quant_fast; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + const int16_t *dequant_ptr = d->dequant; + const int16_t *inv_zig_zag = vp8_default_inv_zig_zag; + + double ftmp[13]; + uint64_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ones) = { 0xffffffffffffffffULL }; + int eob = 0; + + __asm__ volatile( + // loop 0 ~ 7 + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t" + "li %[tmp0], 0x0f \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t" + + "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t" + "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t" + "xor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + + "gsldlc1 %[ftmp5], 0x07(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[round_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[round_ptr]) \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "gsldlc1 %[ftmp7], 0x07(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[quant_ptr]) \n\t" + "gsldlc1 %[ftmp8], 0x0f(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x08(%[quant_ptr]) \n\t" + "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + + "xor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" + "xor %[ftmp8], %[ftmp6], %[ftmp4] \n\t" + 
"psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "gssdlc1 %[ftmp7], 0x07(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp8], 0x0f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x08(%[qcoeff_ptr]) \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[inv_zig_zag]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[inv_zig_zag]) \n\t" + "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp5], %[ftmp5], %[ones] \n\t" + "xor %[ftmp6], %[ftmp6], %[ones] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "and %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pmaxsh %[ftmp10], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp5], 0x07(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[dequant_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[dequant_ptr]) \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "gssdlc1 %[ftmp5], 0x07(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp6], 0x0f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x08(%[dqcoeff_ptr]) \n\t" + + // loop 8 ~ 15 + "gsldlc1 %[ftmp1], 0x17(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[coeff_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[coeff_ptr]) \n\t" + + "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t" + "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t" + "xor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + + "gsldlc1 %[ftmp5], 0x17(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x10(%[round_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x1f(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x18(%[round_ptr]) \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "gsldlc1 %[ftmp7], 0x17(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x10(%[quant_ptr]) \n\t" + "gsldlc1 %[ftmp8], 0x1f(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x18(%[quant_ptr]) \n\t" + "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + + "xor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" + "xor %[ftmp8], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "gssdlc1 %[ftmp7], 0x17(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x10(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp8], 0x1f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x18(%[qcoeff_ptr]) \n\t" + + "gsldlc1 %[ftmp1], 0x17(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[inv_zig_zag]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[inv_zig_zag]) \n\t" + "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp5], %[ftmp5], %[ones] \n\t" + "xor %[ftmp6], %[ftmp6], %[ones] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "and %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pmaxsh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp5], 0x17(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x10(%[dequant_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x1f(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x18(%[dequant_ptr]) \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "gssdlc1 %[ftmp5], 0x17(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x10(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp6], 0x1f(%[dqcoeff_ptr]) \n\t" + 
"gssdrc1 %[ftmp6], 0x18(%[dqcoeff_ptr]) \n\t" + + "li %[tmp0], 0x10 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "psrlw %[ftmp11], %[ftmp10], %[ftmp9] \n\t" + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "li %[tmp0], 0xaa \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "pshufh %[ftmp11], %[ftmp10], %[ftmp9] \n\t" + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "li %[tmp0], 0xffff \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "and %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t" + "gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [coeff_ptr] "r"((mips_reg)coeff_ptr), + [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr), + [dequant_ptr] "r"((mips_reg)dequant_ptr), + [round_ptr] "r"((mips_reg)round_ptr), + [quant_ptr] "r"((mips_reg)quant_ptr), + [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr), + [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob), + [ones] "f"(ones) + : "memory"); + + *d->eob = eob; +} + +void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) { + int eob = 0; + int x, y, z, sz, zbin; + const int16_t *zbin_boost_ptr = b->zrun_zbin_boost; + const int16_t *coeff_ptr = b->coeff; + const int16_t *zbin_ptr = b->zbin; + const int16_t *round_ptr = b->round; + const int16_t *quant_ptr = b->quant; + const int16_t *quant_shift_ptr = b->quant_shift; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + const int16_t *dequant_ptr = d->dequant; + const int16_t zbin_oq_value = b->zbin_extra; + register double ftmp0 asm("$f0"); + + // memset(qcoeff_ptr, 0, 32); + // memset(dqcoeff_ptr, 0, 32); + /* clang-format off */ + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[qcoeff_ptr]) \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[dqcoeff_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp0) + : [qcoeff_ptr]"r"(qcoeff_ptr), [dqcoeff_ptr]"r"(dqcoeff_ptr) + : "memory" + ); + /* clang-format on */ + + REGULAR_SELECT_EOB(1, 0); + REGULAR_SELECT_EOB(2, 1); + REGULAR_SELECT_EOB(3, 4); + REGULAR_SELECT_EOB(4, 8); + REGULAR_SELECT_EOB(5, 5); + REGULAR_SELECT_EOB(6, 2); + REGULAR_SELECT_EOB(7, 3); + REGULAR_SELECT_EOB(8, 6); + REGULAR_SELECT_EOB(9, 9); + REGULAR_SELECT_EOB(10, 12); + REGULAR_SELECT_EOB(11, 13); + REGULAR_SELECT_EOB(12, 10); + REGULAR_SELECT_EOB(13, 7); + REGULAR_SELECT_EOB(14, 11); + REGULAR_SELECT_EOB(15, 14); + REGULAR_SELECT_EOB(16, 15); + + *d->eob = (char)eob; +} diff --git a/libvpx/vp8/encoder/onyx_if.c b/libvpx/vp8/encoder/onyx_if.c index b571d29d9..224318242 100644 --- a/libvpx/vp8/encoder/onyx_if.c 
+++ b/libvpx/vp8/encoder/onyx_if.c @@ -12,10 +12,12 @@ #include "./vpx_scale_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "./vp8_rtcd.h" +#include "bitstream.h" #include "vp8/common/onyxc_int.h" #include "vp8/common/blockd.h" #include "onyx_int.h" #include "vp8/common/systemdependent.h" +#include "vp8/common/vp8_skin_detection.h" #include "vp8/encoder/quantize.h" #include "vp8/common/alloccommon.h" #include "mcomp.h" @@ -35,6 +37,7 @@ #include "vp8/common/threading.h" #include "vpx_ports/system_state.h" #include "vpx_ports/vpx_timer.h" +#include "vpx_util/vpx_write_yuv_frame.h" #if ARCH_ARM #include "vpx_ports/arm.h" #endif @@ -42,6 +45,13 @@ #include "mr_dissim.h" #endif #include "encodeframe.h" +#if CONFIG_MULTITHREAD +#include "ethreading.h" +#endif +#include "picklpf.h" +#if !CONFIG_REALTIME_ONLY +#include "temporal_filter.h" +#endif #include <assert.h> #include <math.h> @@ -50,28 +60,17 @@ #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING extern int vp8_update_coef_context(VP8_COMP *cpi); -extern void vp8_update_coef_probs(VP8_COMP *cpi); #endif -extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi); -extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val); -extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi); - extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag); extern void print_parms(VP8_CONFIG *ocf, char *filenam); extern unsigned int vp8_get_processor_freq(); extern void print_tree_update_probs(); -extern int vp8cx_create_encoder_threads(VP8_COMP *cpi); -extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi); - -int vp8_estimate_entropy_savings(VP8_COMP *cpi); int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); -extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance); - static void set_default_lf_deltas(VP8_COMP *cpi); extern const int vp8_gf_interval_table[101]; @@ -87,6 +86,9 @@ FILE *yuv_file; #ifdef OUTPUT_YUV_DENOISED FILE *yuv_denoised_file; #endif +#ifdef OUTPUT_YUV_SKINMAP +static FILE *yuv_skinmap_file = NULL; +#endif #if 0 FILE *framepsnr; @@ -219,7 +221,8 @@ static void save_layer_context(VP8_COMP *cpi) { lc->inter_frame_target = cpi->inter_frame_target; lc->total_byte_count = cpi->total_byte_count; lc->filter_level = cpi->common.filter_level; - + lc->frames_since_last_drop_overshoot = cpi->frames_since_last_drop_overshoot; + lc->force_maxqp = cpi->force_maxqp; lc->last_frame_percent_intra = cpi->last_frame_percent_intra; memcpy(lc->count_mb_ref_frame_usage, cpi->mb.count_mb_ref_frame_usage, @@ -255,7 +258,8 @@ static void restore_layer_context(VP8_COMP *cpi, const int layer) { cpi->inter_frame_target = lc->inter_frame_target; cpi->total_byte_count = lc->total_byte_count; cpi->common.filter_level = lc->filter_level; - + cpi->frames_since_last_drop_overshoot = lc->frames_since_last_drop_overshoot; + cpi->force_maxqp = lc->force_maxqp; cpi->last_frame_percent_intra = lc->last_frame_percent_intra; memcpy(cpi->mb.count_mb_ref_frame_usage, lc->count_mb_ref_frame_usage, @@ -447,18 +451,6 @@ static void dealloc_compressor_data(VP8_COMP *cpi) { cpi->mb.pip = 0; #if CONFIG_MULTITHREAD - /* De-allocate mutex */ - if (cpi->pmutex != NULL) { - VP8_COMMON *const pc = &cpi->common; - int i; - - for (i = 0; i < pc->mb_rows; ++i) { - pthread_mutex_destroy(&cpi->pmutex[i]); - } - vpx_free(cpi->pmutex); - cpi->pmutex = NULL; - } - vpx_free(cpi->mt_current_mb_col); cpi->mt_current_mb_col = NULL; #endif @@ -616,6 
+608,59 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) { set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); } +static void compute_skin_map(VP8_COMP *cpi) { + int mb_row, mb_col, num_bl; + VP8_COMMON *cm = &cpi->common; + const uint8_t *src_y = cpi->Source->y_buffer; + const uint8_t *src_u = cpi->Source->u_buffer; + const uint8_t *src_v = cpi->Source->v_buffer; + const int src_ystride = cpi->Source->y_stride; + const int src_uvstride = cpi->Source->uv_stride; + + const SKIN_DETECTION_BLOCK_SIZE bsize = + (cm->Width * cm->Height <= 352 * 288) ? SKIN_8X8 : SKIN_16X16; + + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { + num_bl = 0; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + const int bl_index = mb_row * cm->mb_cols + mb_col; + cpi->skin_map[bl_index] = + vp8_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride, + bsize, cpi->consec_zero_last[bl_index], 0); + num_bl++; + src_y += 16; + src_u += 8; + src_v += 8; + } + src_y += (src_ystride << 4) - (num_bl << 4); + src_u += (src_uvstride << 3) - (num_bl << 3); + src_v += (src_uvstride << 3) - (num_bl << 3); + } + + // Remove isolated skin blocks (none of its neighbors are skin) and isolated + // non-skin blocks (all of its neighbors are skin). Skip the boundary. + for (mb_row = 1; mb_row < cm->mb_rows - 1; mb_row++) { + for (mb_col = 1; mb_col < cm->mb_cols - 1; mb_col++) { + const int bl_index = mb_row * cm->mb_cols + mb_col; + int num_neighbor = 0; + int mi, mj; + int non_skin_threshold = 8; + + for (mi = -1; mi <= 1; mi += 1) { + for (mj = -1; mj <= 1; mj += 1) { + int bl_neighbor_index = (mb_row + mi) * cm->mb_cols + mb_col + mj; + if (cpi->skin_map[bl_neighbor_index]) num_neighbor++; + } + } + + if (cpi->skin_map[bl_index] && num_neighbor < 2) + cpi->skin_map[bl_index] = 0; + if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold) + cpi->skin_map[bl_index] = 1; + } + } +} + static void set_default_lf_deltas(VP8_COMP *cpi) { cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1; cpi->mb.e_mbd.mode_ref_lf_delta_update = 1; @@ -1096,9 +1141,6 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { int width = cm->Width; int height = cm->Height; -#if CONFIG_MULTITHREAD - int prev_mb_rows = cm->mb_rows; -#endif if (vp8_alloc_frame_buffers(cm, width, height)) { vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, @@ -1190,26 +1232,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { if (cpi->oxcf.multi_threaded > 1) { int i; - /* De-allocate and re-allocate mutex */ - if (cpi->pmutex != NULL) { - for (i = 0; i < prev_mb_rows; ++i) { - pthread_mutex_destroy(&cpi->pmutex[i]); - } - vpx_free(cpi->pmutex); - cpi->pmutex = NULL; - } - - CHECK_MEM_ERROR(cpi->pmutex, - vpx_malloc(sizeof(*cpi->pmutex) * cm->mb_rows)); - if (cpi->pmutex) { - for (i = 0; i < cm->mb_rows; ++i) { - pthread_mutex_init(&cpi->pmutex[i], NULL); - } - } - vpx_free(cpi->mt_current_mb_col); CHECK_MEM_ERROR(cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); + for (i = 0; i < cm->mb_rows; ++i) + vpx_atomic_init(&cpi->mt_current_mb_col[i], 0); } #endif @@ -1526,9 +1553,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { setup_features(cpi); - { + if (!cpi->use_roi_static_threshold) { int i; - for (i = 0; i < MAX_MB_SEGMENTS; ++i) { cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; } @@ -1788,6 +1814,8 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->active_map_enabled = 0; + cpi->use_roi_static_threshold = 0; + #if 0 /* Experimental 
code for lagged and one pass */ /* Initialise one_pass GF frames stats */ @@ -1857,6 +1885,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->cyclic_refresh_map = (signed char *)NULL; } + CHECK_MEM_ERROR(cpi->skin_map, vpx_calloc(cm->mb_rows * cm->mb_cols, + sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR(cpi->consec_zero_last, vpx_calloc(cm->mb_rows * cm->mb_cols, 1)); CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias, @@ -1880,6 +1911,7 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->common.refresh_alt_ref_frame = 0; cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -1933,6 +1965,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { #ifdef OUTPUT_YUV_DENOISED yuv_denoised_file = fopen("denoised.yuv", "ab"); #endif +#ifdef OUTPUT_YUV_SKINMAP + yuv_skinmap_file = fopen("skinmap.yuv", "wb"); +#endif #if 0 framepsnr = fopen("framepsnr.stt", "a"); @@ -2284,6 +2319,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { dealloc_compressor_data(cpi); vpx_free(cpi->mb.ss); vpx_free(cpi->tok); + vpx_free(cpi->skin_map); vpx_free(cpi->cyclic_refresh_map); vpx_free(cpi->consec_zero_last); vpx_free(cpi->consec_zero_last_mvbias); @@ -2298,6 +2334,9 @@ void vp8_remove_compressor(VP8_COMP **ptr) { #ifdef OUTPUT_YUV_DENOISED fclose(yuv_denoised_file); #endif +#ifdef OUTPUT_YUV_SKINMAP + fclose(yuv_skinmap_file); +#endif #if 0 @@ -2474,34 +2513,6 @@ int vp8_update_entropy(VP8_COMP *cpi, int update) { return 0; } -#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) -void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { - unsigned char *src = s->y_buffer; - int h = s->y_height; - - do { - fwrite(src, s->y_width, 1, yuv_file); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, yuv_file); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, yuv_file); - src += s->uv_stride; - } while (--h); -} -#endif - static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; @@ -2914,8 +2925,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame; cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame; - } else /* For non key frames */ - { + } else { if (cm->refresh_alt_ref_frame) { assert(!cm->copy_buffer_to_arf); @@ -2936,8 +2946,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[ALTREF_FRAME] = cpi->current_ref_frames[LAST_FRAME]; } - } else /* if (cm->copy_buffer_to_arf == 2) */ - { + } else { if (cm->alt_fb_idx != cm->gld_fb_idx) { yv12_fb[cm->gld_fb_idx].flags |= VP8_ALTR_FRAME; yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME; @@ -2969,8 +2978,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[GOLDEN_FRAME] = cpi->current_ref_frames[LAST_FRAME]; } - } else /* if (cm->copy_buffer_to_gf == 2) */ - { + } else { if (cm->alt_fb_idx != cm->gld_fb_idx) { yv12_fb[cm->alt_fb_idx].flags |= VP8_GOLD_FRAME; yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME; @@ -3001,8 +3009,7 @@ static void update_reference_frames(VP8_COMP *cpi) { int i; for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i) vp8_yv12_copy_frame(cpi->Source, &cpi->denoiser.yv12_running_avg[i]); - } else /* For non key frames */ - { + } else { vp8_yv12_extend_frame_borders( 
&cpi->denoiser.yv12_running_avg[INTRA_FRAME]); @@ -3234,7 +3241,7 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { } #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */ } #endif @@ -3788,6 +3795,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif + compute_skin_map(cpi); + /* Setup background Q adjustment for error resilient mode. * For multi-layer encodes only enable this for the base layer. */ @@ -3861,7 +3870,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif #ifdef OUTPUT_YUV_SRC - vp8_write_yuv_frame(yuv_file, cpi->Source); + vpx_write_yuv_frame(yuv_file, cpi->Source); #endif do { @@ -3989,7 +3998,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #else /* transform / motion compensation build reconstruction frame */ vp8_encode_frame(cpi); - if (cpi->oxcf.screen_content_mode == 2) { + + if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { if (vp8_drop_encodedframe_overshoot(cpi, Q)) return; } @@ -4421,11 +4431,20 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif +#ifdef OUTPUT_YUV_SKINMAP + if (cpi->common.current_video_frame > 1) { + vp8_compute_skin_map(cpi, yuv_skinmap_file); + } +#endif + #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { /* start loopfilter in separate thread */ sem_post(&cpi->h_event_start_lpf); cpi->b_lpf_running = 1; + /* wait for the filter_level to be picked so that we can continue with + * stream packing */ + sem_wait(&cpi->h_event_end_lpf); } else #endif { @@ -4435,7 +4454,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, update_reference_frames(cpi); #ifdef OUTPUT_YUV_DENOISED - vp8_write_yuv_frame(yuv_denoised_file, + vpx_write_yuv_frame(yuv_denoised_file, &cpi->denoiser.yv12_running_avg[INTRA_FRAME]); #endif @@ -4445,12 +4464,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif -#if CONFIG_MULTITHREAD - /* wait that filter_level is picked so that we can continue with stream - * packing */ - if (cpi->b_multi_threaded) sem_wait(&cpi->h_event_end_lpf); -#endif - /* build the bitstream */ vp8_pack_bitstream(cpi, dest, dest_end, size); @@ -4784,7 +4797,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif /* DEBUG */ - /* vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */ + /* vpx_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */ } #if !CONFIG_REALTIME_ONLY static void Pass2Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest, @@ -5292,7 +5305,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, #if CONFIG_MULTITHREAD /* wait for the lpf thread done */ - if (cpi->b_multi_threaded && cpi->b_lpf_running) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) { sem_wait(&cpi->h_event_end_lpf); cpi->b_lpf_running = 0; } @@ -5338,9 +5351,6 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, const int range = 63; int i; - // This method is currently incompatible with the cyclic refresh method - if (cpi->cyclic_refresh_mode_enabled) return -1; - // Check number of rows and columns match if (cpi->common.mb_rows != (int)rows || cpi->common.mb_cols != (int)cols) { return -1; @@ -5359,7 +5369,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, 
return -1; } - if (!map) { + // Also disable segmentation if no deltas are specified. + if (!map || (delta_q[0] == 0 && delta_q[1] == 0 && delta_q[2] == 0 && + delta_q[3] == 0 && delta_lf[0] == 0 && delta_lf[1] == 0 && + delta_lf[2] == 0 && delta_lf[3] == 0 && threshold[0] == 0 && + threshold[1] == 0 && threshold[2] == 0 && threshold[3] == 0)) { disable_segmentation(cpi); return 0; } @@ -5396,6 +5410,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, /* Initialise the feature data structure */ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); + if (threshold[0] != 0 || threshold[1] != 0 || threshold[2] != 0 || + threshold[3] != 0) + cpi->use_roi_static_threshold = 1; + cpi->cyclic_refresh_mode_enabled = 0; + return 0; } diff --git a/libvpx/vp8/encoder/onyx_int.h b/libvpx/vp8/encoder/onyx_int.h index fe775064a..c489b46c2 100644 --- a/libvpx/vp8/encoder/onyx_int.h +++ b/libvpx/vp8/encoder/onyx_int.h @@ -249,6 +249,10 @@ typedef struct { int filter_level; + int frames_since_last_drop_overshoot; + + int force_maxqp; + int last_frame_percent_intra; int count_mb_ref_frame_usage[MAX_REF_FRAMES]; @@ -471,6 +475,8 @@ typedef struct VP8_COMP { int zeromv_count; int lf_zeromv_pct; + unsigned char *skin_map; + unsigned char *segmentation_map; signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; int segment_encode_breakout[MAX_MB_SEGMENTS]; @@ -503,6 +509,7 @@ typedef struct VP8_COMP { int mse_source_denoised; int force_maxqp; + int frames_since_last_drop_overshoot; // GF update for 1 pass cbr. int gf_update_onepass_cbr; @@ -511,11 +518,9 @@ typedef struct VP8_COMP { #if CONFIG_MULTITHREAD /* multithread data */ - pthread_mutex_t *pmutex; - pthread_mutex_t mt_mutex; /* mutex for b_multi_threaded */ - int *mt_current_mb_col; + vpx_atomic_int *mt_current_mb_col; int mt_sync_range; - int b_multi_threaded; + vpx_atomic_int b_multi_threaded; int encoding_thread_count; int b_lpf_running; @@ -687,6 +692,9 @@ typedef struct VP8_COMP { int token_costs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; } rd_costs; + + // Use the static threshold from ROI settings. + int use_roi_static_threshold; } VP8_COMP; void vp8_initialize_enc(void); diff --git a/libvpx/vp8/encoder/pickinter.c b/libvpx/vp8/encoder/pickinter.c index eb713f11c..a9943eb6a 100644 --- a/libvpx/vp8/encoder/pickinter.c +++ b/libvpx/vp8/encoder/pickinter.c @@ -25,6 +25,7 @@ #include "vp8/common/reconintra4x4.h" #include "vpx_dsp/variance.h" #include "mcomp.h" +#include "vp8/common/vp8_skin_detection.h" #include "rdopt.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" @@ -36,82 +37,9 @@ extern unsigned int cnt_pm; #endif -#define MODEL_MODE 1 - extern const int vp8_ref_frame_order[MAX_MODES]; extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; -// Fixed point implementation of a skin color classifier. Skin color -// is model by a Gaussian distribution in the CbCr color space. -// See ../../test/skin_color_detector_test.cc where the reference -// skin color classifier is defined. - -// Fixed-point skin color model parameters. -static const int skin_mean[5][2] = { { 7463, 9614 }, - { 6400, 10240 }, - { 7040, 10240 }, - { 8320, 9280 }, - { 6800, 9614 } }; -static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 -static const int skin_threshold[6] = { 1570636, 1400000, 800000, - 800000, 800000, 800000 }; // q18 - -// Evaluates the Mahalanobis distance measure for the input CbCr values. 
-static int evaluate_skin_color_difference(int cb, int cr, int idx) { - const int cb_q6 = cb << 6; - const int cr_q6 = cr << 6; - const int cb_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); - const int cbcr_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); - const int cr_diff_q12 = - (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); - const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; - const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; - const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; - const int skin_diff = - skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + - skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; - return skin_diff; -} - -// Checks if the input yCbCr values corresponds to skin color. -static int is_skin_color(int y, int cb, int cr, int consec_zeromv) { - if (y < 40 || y > 220) { - return 0; - } else { - if (MODEL_MODE == 0) { - return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); - } else { - int i = 0; - // No skin if block has been zero motion for long consecutive time. - if (consec_zeromv > 60) return 0; - // Exit on grey. - if (cb == 128 && cr == 128) return 0; - // Exit on very strong cb. - if (cb > 150 && cr < 110) return 0; - for (; i < 5; ++i) { - int skin_color_diff = evaluate_skin_color_difference(cb, cr, i); - if (skin_color_diff < skin_threshold[i + 1]) { - if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) { - return 0; - } else if (consec_zeromv > 25 && - skin_color_diff > (skin_threshold[i + 1] >> 1)) { - return 0; - } else { - return 1; - } - } - // Exit if difference is much large than the threshold. - if (skin_color_diff > (skin_threshold[i + 1] << 3)) { - return 0; - } - } - return 0; - } - } -} - static int macroblock_corner_grad(unsigned char *signal, int stride, int offsetx, int offsety, int sgnx, int sgny) { @@ -760,27 +688,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #endif // Check if current macroblock is in skin area. 
- { - const int y = (x->src.y_buffer[7 * x->src.y_stride + 7] + - x->src.y_buffer[7 * x->src.y_stride + 8] + - x->src.y_buffer[8 * x->src.y_stride + 7] + - x->src.y_buffer[8 * x->src.y_stride + 8]) >> - 2; - const int cb = (x->src.u_buffer[3 * x->src.uv_stride + 3] + - x->src.u_buffer[3 * x->src.uv_stride + 4] + - x->src.u_buffer[4 * x->src.uv_stride + 3] + - x->src.u_buffer[4 * x->src.uv_stride + 4]) >> - 2; - const int cr = (x->src.v_buffer[3 * x->src.uv_stride + 3] + - x->src.v_buffer[3 * x->src.uv_stride + 4] + - x->src.v_buffer[4 * x->src.uv_stride + 3] + - x->src.v_buffer[4 * x->src.uv_stride + 4]) >> - 2; - x->is_skin = 0; - if (!cpi->oxcf.screen_content_mode) { - int block_index = mb_row * cpi->common.mb_cols + mb_col; - x->is_skin = is_skin_color(y, cb, cr, cpi->consec_zero_last[block_index]); - } + x->is_skin = 0; + if (!cpi->oxcf.screen_content_mode) { + int block_index = mb_row * cpi->common.mb_cols + mb_col; + x->is_skin = cpi->skin_map[block_index]; } #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { diff --git a/libvpx/vp8/encoder/picklpf.c b/libvpx/vp8/encoder/picklpf.c index 6f287322e..b1b712db9 100644 --- a/libvpx/vp8/encoder/picklpf.c +++ b/libvpx/vp8/encoder/picklpf.c @@ -12,6 +12,7 @@ #include "./vpx_scale_rtcd.h" #include "vp8/common/onyxc_int.h" #include "onyx_int.h" +#include "vp8/encoder/picklpf.h" #include "vp8/encoder/quantize.h" #include "vpx_mem/vpx_mem.h" #include "vpx_scale/vpx_scale.h" diff --git a/libvpx/vp8/encoder/picklpf.h b/libvpx/vp8/encoder/picklpf.h new file mode 100644 index 000000000..e6ad0dbf2 --- /dev/null +++ b/libvpx/vp8/encoder/picklpf.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_ENCODER_PICKLPF_H_ +#define VP8_ENCODER_PICKLPF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; +struct yv12_buffer_config; + +void vp8cx_pick_filter_level_fast(struct yv12_buffer_config *sd, + struct VP8_COMP *cpi); +void vp8cx_set_alt_lf_level(struct VP8_COMP *cpi, int filt_val); +void vp8cx_pick_filter_level(struct yv12_buffer_config *sd, struct VP8_COMP *cpi); + +#ifdef __cplusplus +} +#endif + +#endif // VP8_ENCODER_PICKLPF_H_ diff --git a/libvpx/vp8/encoder/ratectrl.c b/libvpx/vp8/encoder/ratectrl.c index e89247ae4..e58c31098 100644 --- a/libvpx/vp8/encoder/ratectrl.c +++ b/libvpx/vp8/encoder/ratectrl.c @@ -498,11 +498,9 @@ static void calc_gf_params(VP8_COMP *cpi) { * This is updated once the real frame size/boost is known. */ if (cpi->oxcf.fixed_q == -1) { - if (cpi->pass == 2) /* 2 Pass */ - { + if (cpi->pass == 2) { /* 2 Pass */ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; - } else /* 1 Pass */ - { + } else { /* 1 Pass */ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; if (cpi->last_boost > 750) cpi->frames_till_gf_update_due++; @@ -1442,12 +1440,33 @@ int vp8_pick_frame_size(VP8_COMP *cpi) { // If this just encoded frame (mcomp/transform/quant, but before loopfilter and // pack_bitstream) has large overshoot, and was not being encoded close to the // max QP, then drop this frame and force next frame to be encoded at max QP. -// Condition this on 1 pass CBR with screen content mode and frame dropper off.
+// Allow this for screen_content_mode = 2, or if frame dropping is allowed. // TODO(marpan): Should do this exit condition during the encode_frame // (i.e., halfway during the encoding of the frame) to save cycles. int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { - if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && - cpi->drop_frames_allowed == 0 && cpi->common.frame_type != KEY_FRAME) { + int force_drop_overshoot = 0; +#if CONFIG_MULTI_RES_ENCODING + // Only check for dropping due to overshoot on the lowest stream. + // If the lowest stream of the multi-res encoding was dropped due to + // overshoot, then force dropping on all upper layer streams + // (mr_encoder_id > 0). + LOWER_RES_FRAME_INFO *low_res_frame_info = + (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; + if (cpi->oxcf.mr_total_resolutions > 1 && cpi->oxcf.mr_encoder_id > 0) { + force_drop_overshoot = low_res_frame_info->is_frame_dropped_overshoot_maxqp; + if (!force_drop_overshoot) { + cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; + return 0; + } + } +#endif + if (cpi->common.frame_type != KEY_FRAME && + (cpi->oxcf.screen_content_mode == 2 || + (cpi->drop_frames_allowed && + (force_drop_overshoot || + (cpi->rate_correction_factor < (4.0f * MIN_BPB_FACTOR) && + cpi->frames_since_last_drop_overshoot > (int)cpi->framerate))))) { // Note: the "projected_frame_size" from encode_frame() only gives estimate // of mode/motion vector rate (in non-rd mode): so below we only require // that projected_frame_size is somewhat greater than per-frame-bandwidth, @@ -1458,17 +1477,20 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { // Rate threshold, in bytes. int thresh_rate = 2 * (cpi->av_per_frame_bandwidth >> 3); // Threshold for the average (over all macroblocks) of the pixel-sum - // residual error over 16x16 block. Should add QP dependence on threshold? - int thresh_pred_err_mb = (256 << 4); + // residual error over 16x16 block. + int thresh_pred_err_mb = (200 << 4); int pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs); - if (Q < thresh_qp && cpi->projected_frame_size > thresh_rate && - pred_err_mb > thresh_pred_err_mb) { + // Reduce/ignore thresh_rate if pred_err_mb is much larger than its threshold, + // to give more weight to the pred_err metric for overshoot detection. + if (cpi->drop_frames_allowed && pred_err_mb > (thresh_pred_err_mb << 4)) + thresh_rate = thresh_rate >> 3; + if ((Q < thresh_qp && cpi->projected_frame_size > thresh_rate && + pred_err_mb > thresh_pred_err_mb) || + force_drop_overshoot) { + unsigned int i; double new_correction_factor; - const int target_size = cpi->av_per_frame_bandwidth; int target_bits_per_mb; - // Drop this frame: advance frame counters, and set force_maxqp flag. - cpi->common.current_video_frame++; - cpi->frames_since_key++; + const int target_size = cpi->av_per_frame_bandwidth; // Flag to indicate we will force next frame to be encoded at max QP. cpi->force_maxqp = 1; // Reset the buffer levels. @@ -1499,14 +1521,40 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { if (cpi->rate_correction_factor > MAX_BPB_FACTOR) { cpi->rate_correction_factor = MAX_BPB_FACTOR; } + // Drop this frame: update frame counters. + cpi->common.current_video_frame++; + cpi->frames_since_key++; + cpi->temporal_pattern_counter++; + cpi->frames_since_last_drop_overshoot = 0; + if (cpi->oxcf.number_of_layers > 1) { + // Set max_qp and rate correction for all temporal layers if overshoot + // is detected.
+ for (i = 0; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->force_maxqp = 1; + lc->frames_since_last_drop_overshoot = 0; + lc->rate_correction_factor = cpi->rate_correction_factor; + } + } +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 1; +#endif return 1; - } else { - cpi->force_maxqp = 0; - return 0; } cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0; +#endif return 0; } cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0; +#endif return 0; } diff --git a/libvpx/vp8/encoder/rdopt.c b/libvpx/vp8/encoder/rdopt.c index 3792b10f8..e210b4410 100644 --- a/libvpx/vp8/encoder/rdopt.c +++ b/libvpx/vp8/encoder/rdopt.c @@ -16,12 +16,14 @@ #include "vpx_config.h" #include "vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" +#include "encodeframe.h" #include "tokenize.h" #include "treewriter.h" #include "onyx_int.h" #include "modecosts.h" #include "encodeintra.h" #include "pickinter.h" +#include "vp8/common/common.h" #include "vp8/common/entropymode.h" #include "vp8/common/reconinter.h" #include "vp8/common/reconintra.h" @@ -852,8 +854,7 @@ static int labels2mode(MACROBLOCK *x, int const *labelings, int which_label, default: break; } - if (m == ABOVE4X4) /* replace above with left if same */ - { + if (m == ABOVE4X4) { /* replace above with left if same */ int_mv left_mv; left_mv.as_int = col ? d[-1].bmi.mv.as_int : left_block_mv(mic, i); @@ -959,19 +960,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, vp8_variance_fn_ptr_t *v_fn_ptr; ENTROPY_CONTEXT_PLANES t_above, t_left; - ENTROPY_CONTEXT *ta; - ENTROPY_CONTEXT *tl; ENTROPY_CONTEXT_PLANES t_above_b, t_left_b; - ENTROPY_CONTEXT *ta_b; - ENTROPY_CONTEXT *tl_b; memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); - ta = (ENTROPY_CONTEXT *)&t_above; - tl = (ENTROPY_CONTEXT *)&t_left; - ta_b = (ENTROPY_CONTEXT *)&t_above_b; - tl_b = (ENTROPY_CONTEXT *)&t_left_b; + vp8_zero(t_above_b); + vp8_zero(t_left_b); br = 0; bd = 0; @@ -1151,13 +1146,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, mode_selected = this_mode; best_label_rd = this_rd; - memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_above_b, &t_above_s, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left_b, &t_left_s, sizeof(ENTROPY_CONTEXT_PLANES)); } } /*for each 4x4 mode*/ - memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_above, &t_above_b, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left, &t_left_b, sizeof(ENTROPY_CONTEXT_PLANES)); labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected], bsi->ref_mv, x->mvcost); diff --git a/libvpx/vp8/encoder/rdopt.h b/libvpx/vp8/encoder/rdopt.h index 8186ff105..960bd8f1c 100644 --- a/libvpx/vp8/encoder/rdopt.h +++ b/libvpx/vp8/encoder/rdopt.h @@ -19,6 +19,9 @@ extern "C" { #define RDCOST(RM, DM, R, D) (((128 + (R) * (RM)) >> 8) + (DM) * (D)) +void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex); +void vp8_auto_select_speed(VP8_COMP *cpi); + static 
INLINE void insertsortmv(int arr[], int len) { int i, j, k; diff --git a/libvpx/vp8/encoder/temporal_filter.c b/libvpx/vp8/encoder/temporal_filter.c index 1b2f46bb6..0a7d25fb0 100644 --- a/libvpx/vp8/encoder/temporal_filter.c +++ b/libvpx/vp8/encoder/temporal_filter.c @@ -20,6 +20,7 @@ #include "ratectrl.h" #include "vp8/common/quant_common.h" #include "segmentation.h" +#include "temporal_filter.h" #include "vpx_mem/vpx_mem.h" #include "vp8/common/swapyv12buffer.h" #include "vp8/common/threading.h" diff --git a/libvpx/vp8/encoder/temporal_filter.h b/libvpx/vp8/encoder/temporal_filter.h new file mode 100644 index 000000000..865d909fb --- /dev/null +++ b/libvpx/vp8/encoder/temporal_filter.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_ENCODER_TEMPORAL_FILTER_H_ +#define VP8_ENCODER_TEMPORAL_FILTER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; + +void vp8_temporal_filter_prepare_c(struct VP8_COMP *cpi, int distance); + +#ifdef __cplusplus +} +#endif + +#endif // VP8_ENCODER_TEMPORAL_FILTER_H_ diff --git a/libvpx/vp8/encoder/x86/dct_sse2.asm b/libvpx/vp8/encoder/x86/dct_sse2.asm index d06bca592..4d92f0341 100644 --- a/libvpx/vp8/encoder/x86/dct_sse2.asm +++ b/libvpx/vp8/encoder/x86/dct_sse2.asm @@ -60,6 +60,8 @@ ret %endmacro +SECTION .text + ;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch) global sym(vp8_short_fdct4x4_sse2) PRIVATE sym(vp8_short_fdct4x4_sse2): diff --git a/libvpx/vp8/encoder/x86/encodeopt.asm b/libvpx/vp8/encoder/x86/encodeopt.asm index 0297220ee..f6c6aeae7 100644 --- a/libvpx/vp8/encoder/x86/encodeopt.asm +++ b/libvpx/vp8/encoder/x86/encodeopt.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;int vp8_block_error_sse2(short *coeff_ptr, short *dcoef_ptr) global sym(vp8_block_error_sse2) PRIVATE sym(vp8_block_error_sse2): diff --git a/libvpx/vp8/encoder/x86/fwalsh_sse2.asm b/libvpx/vp8/encoder/x86/fwalsh_sse2.asm index f4989279f..b5d5de4a5 100644 --- a/libvpx/vp8/encoder/x86/fwalsh_sse2.asm +++ b/libvpx/vp8/encoder/x86/fwalsh_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch) global sym(vp8_short_walsh4x4_sse2) PRIVATE sym(vp8_short_walsh4x4_sse2): diff --git a/libvpx/vp8/encoder/x86/quantize_mmx.asm b/libvpx/vp8/encoder/x86/quantize_mmx.asm deleted file mode 100644 index 2864ce16d..000000000 --- a/libvpx/vp8/encoder/x86/quantize_mmx.asm +++ /dev/null @@ -1,286 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, -; short *qcoeff_ptr,short *dequant_ptr, -; short *scan_mask, short *round_ptr, -; short *quant_ptr, short *dqcoeff_ptr); -global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE -sym(vp8_fast_quantize_b_impl_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;coeff_ptr - movq mm0, [rsi] - - mov rax, arg(1) ;zbin_ptr - movq mm1, [rax] - - movq mm3, mm0 - psraw mm0, 15 - - pxor mm3, mm0 - psubw mm3, mm0 ; abs - - movq mm2, mm3 - pcmpgtw mm1, mm2 - - pandn mm1, mm2 - movq mm3, mm1 - - mov rdx, arg(6) ;quant_ptr - movq mm1, [rdx] - - mov rcx, arg(5) ;round_ptr - movq mm2, [rcx] - - paddw mm3, mm2 - pmulhuw mm3, mm1 - - pxor mm3, mm0 - psubw mm3, mm0 ;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - movq mm0, mm3 - - movq [rdi], mm3 - - mov rax, arg(3) ;dequant_ptr - movq mm2, [rax] - - pmullw mm3, mm2 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax], mm3 - - ; next 8 - movq mm4, [rsi+8] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+8] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+8] - movq mm6, [rcx+8] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+8], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+8] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+8], mm7 - - - ; next 8 - movq mm4, [rsi+16] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+16] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+16] - movq mm6, [rcx+16] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+16], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+16] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+16], mm7 - - - ; next 8 - movq mm4, [rsi+24] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+24] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+24] - movq mm6, [rcx+24] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+24], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+24] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+24], mm7 - - - - mov rdi, arg(4) ;scan_mask - mov rsi, arg(2) ;qcoeff_ptr - - pxor mm5, mm5 - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rsi+8] - - movq mm2, [rdi] - movq mm3, [rdi+8]; - - pcmpeqw mm0, mm7 - pcmpeqw mm1, mm7 - - pcmpeqw mm6, mm6 - pxor mm0, mm6 - - pxor mm1, mm6 - psrlw mm0, 15 - - psrlw mm1, 15 - pmaddwd mm0, mm2 - - pmaddwd mm1, mm3 - movq mm5, mm0 - - paddd mm5, mm1 - - movq mm0, [rsi+16] - movq mm1, [rsi+24] - - movq mm2, [rdi+16] - movq mm3, [rdi+24]; - - pcmpeqw mm0, mm7 - pcmpeqw mm1, mm7 - - pcmpeqw mm6, mm6 - pxor mm0, mm6 - - pxor mm1, mm6 - psrlw mm0, 15 - - psrlw mm1, 15 - pmaddwd mm0, mm2 - - pmaddwd mm1, mm3 - paddd mm5, mm0 - - paddd mm5, mm1 - movq mm0, mm5 - - psrlq mm5, 32 - paddd mm0, mm5 - - ; eob adjustment begins here - movq rcx, mm0 - and rcx, 0xffff - - xor rdx, rdx 
- sub rdx, rcx ; rdx=-rcx - - bsr rax, rcx - inc rax - - sar rdx, 31 - and rax, rdx - ; Substitute the sse assembly for the old mmx mixed assembly/C. The - ; following is kept as reference - ; movq rcx, mm0 - ; bsr rax, rcx - ; - ; mov eob, rax - ; mov eee, rcx - ; - ;if(eee==0) - ;{ - ; eob=-1; - ;} - ;else if(eee<0) - ;{ - ; eob=15; - ;} - ;d->eob = eob+1; - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm index bd92b398a..d2b4711b8 100644 --- a/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm +++ b/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ; void vp8_temporal_filter_apply_sse2 | arg ; (unsigned char *frame1, | 0 ; unsigned int stride, | 1 @@ -203,5 +205,5 @@ align 16 _const_top_bit: times 8 dw 1<<15 align 16 -_const_16w +_const_16w: times 8 dw 16 diff --git a/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c deleted file mode 100644 index 4406dd0cc..000000000 --- a/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "vp8_rtcd.h" -#include "vpx_ports/x86.h" -#include "vp8/encoder/block.h" - -int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, - short *qcoeff_ptr, short *dequant_ptr, - const short *scan_mask, short *round_ptr, - short *quant_ptr, short *dqcoeff_ptr); -void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) { - const short *scan_mask = vp8_default_zig_zag_mask; - short *coeff_ptr = b->coeff; - short *zbin_ptr = b->zbin; - short *round_ptr = b->round; - short *quant_ptr = b->quant_fast; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; - short *dequant_ptr = d->dequant; - - *d->eob = (char)vp8_fast_quantize_b_impl_mmx( - coeff_ptr, zbin_ptr, qcoeff_ptr, dequant_ptr, scan_mask, - - round_ptr, quant_ptr, dqcoeff_ptr); -} diff --git a/libvpx/vp8/encoder/x86/quantize_ssse3.c b/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c index 322f0a151..d54745015 100644 --- a/libvpx/vp8/encoder/x86/quantize_ssse3.c +++ b/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c @@ -10,6 +10,7 @@ #include <tmmintrin.h> /* SSSE3 */ +#include "./vp8_rtcd.h" #include "vp8/encoder/block.h" /* bitscan reverse (bsr) */ diff --git a/libvpx/vp8/vp8_common.mk b/libvpx/vp8/vp8_common.mk index 137f5bb62..246fe6a67 100644 --- a/libvpx/vp8/vp8_common.mk +++ b/libvpx/vp8/vp8_common.mk @@ -116,6 +116,14 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h +# common (c) +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/sixtap_filter_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/loopfilter_filters_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idctllm_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/dequantize_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/copymem_mmi.c 
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idct_blk_mmi.c + ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c endif diff --git a/libvpx/vp8/vp8_cx_iface.c b/libvpx/vp8/vp8_cx_iface.c index f8475ed61..af6689fd9 100644 --- a/libvpx/vp8/vp8_cx_iface.c +++ b/libvpx/vp8/vp8_cx_iface.c @@ -1216,6 +1216,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { 50, /* rc_two_pass_vbrbias */ 0, /* rc_two_pass_vbrmin_section */ 400, /* rc_two_pass_vbrmax_section */ + 0, // rc_2pass_vbr_corpus_complexity (only meaningful for VP9) /* keyframing settings (kf) */ VPX_KF_AUTO, /* g_kfmode*/ diff --git a/libvpx/vp8/vp8_dx_iface.c b/libvpx/vp8/vp8_dx_iface.c index 9ea9c7f04..f20283c1e 100644 --- a/libvpx/vp8/vp8_dx_iface.c +++ b/libvpx/vp8/vp8_dx_iface.c @@ -144,8 +144,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, } si->is_kf = 0; - if (data_sz >= 10 && !(clear[0] & 0x01)) /* I-Frame */ - { + if (data_sz >= 10 && !(clear[0] & 0x01)) { /* I-Frame */ si->is_kf = 1; /* vet via sync code */ @@ -228,7 +227,8 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, } static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, - unsigned int data_sz, vpx_codec_err_t *res) { + unsigned int data_sz, + volatile vpx_codec_err_t *res) { *res = VPX_CODEC_OK; if (ctx->fragments.count == 0) { @@ -267,7 +267,7 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { - vpx_codec_err_t res = VPX_CODEC_OK; + volatile vpx_codec_err_t res; unsigned int resolution_change = 0; unsigned int w, h; @@ -414,7 +414,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, #endif #if CONFIG_MULTITHREAD - if (pbi->b_multithreaded_rd) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows); } #else @@ -580,7 +580,6 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, } } -extern int vp8dx_references_buffer(VP8_COMMON *oci, int ref_frame); static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx, va_list args) { int *ref_info = va_arg(args, int *); diff --git a/libvpx/vp8/vp8cx.mk b/libvpx/vp8/vp8cx.mk index 7bd41a3fb..0dac0169d 100644 --- a/libvpx/vp8/vp8cx.mk +++ b/libvpx/vp8/vp8cx.mk @@ -30,6 +30,7 @@ VP8_CX_SRCS-yes += encoder/encodeintra.c VP8_CX_SRCS-yes += encoder/encodemb.c VP8_CX_SRCS-yes += encoder/encodemv.c VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c +VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.h VP8_CX_SRCS-yes += encoder/firstpass.c VP8_CX_SRCS-yes += encoder/block.h VP8_CX_SRCS-yes += encoder/boolhuff.h @@ -56,11 +57,14 @@ VP8_CX_SRCS-yes += encoder/modecosts.c VP8_CX_SRCS-yes += encoder/onyx_if.c VP8_CX_SRCS-yes += encoder/pickinter.c VP8_CX_SRCS-yes += encoder/picklpf.c +VP8_CX_SRCS-yes += encoder/picklpf.h VP8_CX_SRCS-yes += encoder/vp8_quantize.c VP8_CX_SRCS-yes += encoder/ratectrl.c VP8_CX_SRCS-yes += encoder/rdopt.c VP8_CX_SRCS-yes += encoder/segmentation.c VP8_CX_SRCS-yes += encoder/segmentation.h +VP8_CX_SRCS-yes += common/vp8_skin_detection.c +VP8_CX_SRCS-yes += common/vp8_skin_detection.h VP8_CX_SRCS-yes += encoder/tokenize.c VP8_CX_SRCS-yes += encoder/dct_value_cost.h VP8_CX_SRCS-yes += encoder/dct_value_tokens.h @@ -68,19 +72,20 @@ VP8_CX_SRCS-yes += encoder/treewriter.c VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) +=
common/postproc.h VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c VP8_CX_SRCS-yes += encoder/temporal_filter.c +VP8_CX_SRCS-yes += encoder/temporal_filter.h VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c +VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.h endif -VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c -VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c +VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp8_quantize_ssse3.c VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) @@ -89,7 +94,6 @@ endif VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c -VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm ifeq ($(CONFIG_REALTIME_ONLY),yes) @@ -106,6 +110,9 @@ VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c +VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/vp8_quantize_mmi.c +VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/dct_mmi.c + ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/denoising_msa.c endif diff --git a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c index dd1ea03b6..025254c3f 100644 --- a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c +++ b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -14,14 +14,7 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" - -static int16_t sinpi_1_9 = 0x14a3; -static int16_t sinpi_2_9 = 0x26c9; -static int16_t sinpi_3_9 = 0x3441; -static int16_t sinpi_4_9 = 0x3b6c; -static int16_t cospi_8_64 = 0x3b21; -static int16_t cospi_16_64 = 0x2d41; -static int16_t cospi_24_64 = 0x187e; +#include "vpx_dsp/txfm_common.h" static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) { int32x4_t q8s32, q9s32; diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c index 66aa733b9..7345e259b 100644 --- a/libvpx/vp9/common/vp9_alloccommon.c +++ b/libvpx/vp9/common/vp9_alloccommon.c @@ -17,24 +17,6 @@ #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_onyxc_int.h" -// TODO(hkuang): Don't need to lock the whole pool after implementing atomic -// frame reference count. 
-void lock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_lock(&pool->pool_mutex); -#else - (void)pool; -#endif -} - -void unlock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_unlock(&pool->pool_mutex); -#else - (void)pool; -#endif -} - void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) { const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); @@ -62,8 +44,7 @@ static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) { cm->prev_seg_map_idx = 1; cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx]; - if (!cm->frame_parallel_decode) - cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; + cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; return 0; } @@ -77,20 +58,18 @@ static void free_seg_map(VP9_COMMON *cm) { } cm->current_frame_seg_map = NULL; - - if (!cm->frame_parallel_decode) { - cm->last_frame_seg_map = NULL; - } + cm->last_frame_seg_map = NULL; } void vp9_free_ref_frame_buffers(BufferPool *pool) { int i; for (i = 0; i < FRAME_BUFFERS; ++i) { - if (pool->frame_bufs[i].ref_count > 0 && + if (!pool->frame_bufs[i].released && pool->frame_bufs[i].raw_frame_buffer.data != NULL) { pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer); pool->frame_bufs[i].ref_count = 0; + pool->frame_bufs[i].released = 1; } vpx_free(pool->frame_bufs[i].mvs); pool->frame_bufs[i].mvs = NULL; @@ -176,6 +155,9 @@ fail: } void vp9_remove_common(VP9_COMMON *cm) { +#if CONFIG_VP9_POSTPROC + vp9_free_postproc_buffers(cm); +#endif vp9_free_context_buffers(cm); vpx_free(cm->fc); @@ -186,7 +168,7 @@ void vp9_remove_common(VP9_COMMON *cm) { void vp9_init_context_buffers(VP9_COMMON *cm) { cm->setup_mi(cm); - if (cm->last_frame_seg_map && !cm->frame_parallel_decode) + if (cm->last_frame_seg_map) memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); } diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c index bcb9e8f29..47cd63e94 100644 --- a/libvpx/vp9/common/vp9_entropymode.c +++ b/libvpx/vp9/common/vp9_entropymode.c @@ -428,7 +428,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { vp9_clearall_segfeatures(&cm->seg); cm->seg.abs_delta = SEGMENT_DELTADATA; - if (cm->last_frame_seg_map && !cm->frame_parallel_decode) + if (cm->last_frame_seg_map) memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); if (cm->current_frame_seg_map) @@ -457,7 +457,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { } // prev_mip will only be allocated in encoder. 
- if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode) + if (frame_is_intra_only(cm) && cm->prev_mip) memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip)); diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index ef0297dd5..c7c343aed 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -1612,12 +1612,14 @@ void vp9_loop_filter_data_reset( void vp9_reset_lfm(VP9_COMMON *const cm) { if (cm->lf.filter_level) { - memset(cm->lf.lfm, 0, ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * - cm->lf.lfm_stride * sizeof(*cm->lf.lfm)); + memset(cm->lf.lfm, 0, + ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride * + sizeof(*cm->lf.lfm)); } } -int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) { +int vp9_loop_filter_worker(void *arg1, void *unused) { + LFWorkerData *const lf_data = (LFWorkerData *)arg1; (void)unused; loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only); diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h index da37a6ebd..481a6cdc6 100644 --- a/libvpx/vp9/common/vp9_loopfilter.h +++ b/libvpx/vp9/common/vp9_loopfilter.h @@ -151,8 +151,8 @@ void vp9_loop_filter_data_reset( LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer, struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]); -// Operates on the rows described by 'lf_data'. -int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused); +// Operates on the rows described by 'arg1' (cast to LFWorkerData *). +int vp9_loop_filter_worker(void *arg1, void *unused); #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h index 32db7b7aa..1d96d92c2 100644 --- a/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libvpx/vp9/common/vp9_onyxc_int.h @@ -37,13 +37,10 @@ extern "C" { #define REF_FRAMES_LOG2 3 #define REF_FRAMES (1 << REF_FRAMES_LOG2) -// 4 scratch frames for the new frames to support a maximum of 4 cores decoding -// in parallel, 3 for scaled references on the encoder. -// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number -// of framebuffers. +// 1 scratch frame for the new frame, 3 for scaled references on the encoder. // TODO(jkoleszar): These 3 extra references could probably come from the // normal reference pool. -#define FRAME_BUFFERS (REF_FRAMES + 7) +#define FRAME_BUFFERS (REF_FRAMES + 4) #define FRAME_CONTEXTS_LOG2 2 #define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) @@ -72,30 +69,12 @@ typedef struct { MV_REF *mvs; int mi_rows; int mi_cols; + uint8_t released; vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; - - // The Following variables will only be used in frame parallel decode. - - // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means - // that no FrameWorker owns, or is decoding, this buffer. - VPxWorker *frame_worker_owner; - - // row and col indicate which position frame has been decoded to in real - // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX - // when the frame is fully decoded. - int row; - int col; } RefCntBuffer; typedef struct BufferPool { -// Protect BufferPool from being accessed by several FrameWorkers at -// the same time during frame parallel decode. -// TODO(hkuang): Try to use atomic variable instead of locking the whole pool. 
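The vp9_loop_filter_worker() change above switches to the VPxWorkerHook convention of taking two void pointers and casting inside the hook; the old code cast the function pointer itself to VPxWorkerHook, which is formally undefined behavior when the signatures differ. The shape of the pattern, with LFRows as a hypothetical stand-in for LFWorkerData:

typedef int (*WorkerHook)(void *arg1, void *arg2);  // matches VPxWorkerHook

typedef struct {
  int start, stop;  // row range this worker filters
} LFRows;

static int lf_hook(void *arg1, void *arg2) {
  LFRows *const rows = (LFRows *)arg1;  // cast the argument, not the function
  (void)arg2;
  // ... filter rows->start .. rows->stop ...
  return 1;  // hooks report success with a nonzero return
}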
-#if CONFIG_MULTITHREAD - pthread_mutex_t pool_mutex; -#endif - // Private data associated with the frame buffer callbacks. void *cb_priv; @@ -235,10 +214,6 @@ typedef struct VP9Common { struct loopfilter lf; struct segmentation seg; - // TODO(hkuang): Remove this as it is the same as frame_parallel_decode - // in pbi. - int frame_parallel_decode; // frame-based threading. - // Context probabilities for reference frame prediction MV_REFERENCE_FRAME comp_fixed_ref; MV_REFERENCE_FRAME comp_var_ref[2]; @@ -283,11 +258,6 @@ typedef struct VP9Common { int above_context_alloc_cols; } VP9_COMMON; -// TODO(hkuang): Don't need to lock the whole pool after implementing atomic -// frame reference count. -void lock_buffer_pool(BufferPool *const pool); -void unlock_buffer_pool(BufferPool *const pool); - static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { if (index < 0 || index >= REF_FRAMES) return NULL; if (cm->ref_frame_map[index] < 0) return NULL; @@ -303,7 +273,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; int i; - lock_buffer_pool(cm->buffer_pool); for (i = 0; i < FRAME_BUFFERS; ++i) if (frame_bufs[i].ref_count == 0) break; @@ -314,7 +283,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) { i = INVALID_IDX; } - unlock_buffer_pool(cm->buffer_pool); return i; } @@ -342,7 +310,7 @@ static INLINE void set_partition_probs(const VP9_COMMON *const cm, xd->partition_probs = frame_is_intra_only(cm) ? &vp9_kf_partition_probs[0] - : (const vpx_prob(*)[PARTITION_TYPES - 1])cm->fc->partition_prob; + : (const vpx_prob(*)[PARTITION_TYPES - 1]) cm->fc->partition_prob; } static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd, diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c index b105e5d45..dfc315eea 100644 --- a/libvpx/vp9/common/vp9_postproc.c +++ b/libvpx/vp9/common/vp9_postproc.c @@ -380,7 +380,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, // if mfqe is enabled. Need to take both the quality and the speed // into consideration. 
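Spelling out the FRAME_BUFFERS arithmetic above: REF_FRAMES_LOG2 = 3 gives 8 reference slots, so the pool shrinks from 8 + 7 = 15 buffers (4 scratch frames for four-way frame-parallel decode plus 3 encoder scaled references) to 8 + 4 = 12 (1 scratch frame for the new frame plus the same 3 scaled references). As a compile-time restatement:

#define REF_FRAMES_LOG2 3
#define REF_FRAMES (1 << REF_FRAMES_LOG2)  // 8 reference slots
#define FRAME_BUFFERS (REF_FRAMES + 4)     // + 1 new frame + 3 scaled refs

typedef char check_frame_buffers[FRAME_BUFFERS == 12 ? 1 : -1];  // was 15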
if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) { - vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int); + vpx_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int); } if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) { deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf, @@ -390,7 +390,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q, cm->postproc_state.limits); } else { - vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); + vpx_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); } } else if (flags & VP9D_DEMACROBLOCK) { deblock_and_de_macro_block(cm->frame_to_show, ppbuf, @@ -399,7 +399,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, } else if (flags & VP9D_DEBLOCK) { vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits); } else { - vp8_yv12_copy_frame(cm->frame_to_show, ppbuf); + vpx_yv12_copy_frame(cm->frame_to_show, ppbuf); } ppstate->last_base_qindex = cm->base_qindex; diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h index 1b09b380d..bb9291a26 100644 --- a/libvpx/vp9/common/vp9_reconinter.h +++ b/libvpx/vp9/common/vp9_reconinter.h @@ -26,9 +26,9 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys) { - sf->predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h); + sf->predict[subpel_x != 0][subpel_y != 0][ref](src, src_stride, dst, + dst_stride, kernel, subpel_x, + xs, subpel_y, ys, w, h); } #if CONFIG_VP9_HIGHBITDEPTH @@ -37,8 +37,8 @@ static INLINE void highbd_inter_predictor( const int subpel_x, const int subpel_y, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) { sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h, bd); + src, src_stride, dst, dst_stride, kernel, subpel_x, xs, subpel_y, ys, w, + h, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vp9/common/vp9_rtcd_defs.pl b/libvpx/vp9/common/vp9_rtcd_defs.pl index baf63e97f..22b67ecac 100644 --- a/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. 
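The predict callbacks in vp9_reconinter.h above now take the whole filter bank plus the subpixel phases instead of the pre-indexed tap sets kernel[subpel_x] and kernel[subpel_y], letting a convolve routine pick both tap sets itself. Assuming the usual vpx_filter.h shapes (8 taps per phase, 16 phases), the indexing inside the callee is just:

#include <stdint.h>

#define SUBPEL_BITS 4
#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)  // 16 subpixel phases
#define SUBPEL_TAPS 8

typedef int16_t InterpKernel[SUBPEL_TAPS];

static const int16_t *get_taps(const InterpKernel *kernel, int subpel) {
  return kernel[subpel & (SUBPEL_SHIFTS - 1)];  // phase in 0..15
}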
+## + sub vp9_common_forward_decls() { print <<EOF /* @@ -30,6 +40,7 @@ if ($opts{arch} eq "x86_64") { $ssse3_x86_64 = 'ssse3'; $avx_x86_64 = 'avx'; $avx2_x86_64 = 'avx2'; + $avx512_x86_64 = 'avx512'; } # @@ -46,41 +57,24 @@ specialize qw/vp9_filter_by_weight8x8 sse2 msa/; # # dct # -if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - # Force C versions if CONFIG_EMULATE_HARDWARE is 1 - if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - - add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - } else { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - specialize qw/vp9_iht4x4_16_add sse2/; - - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - specialize qw/vp9_iht8x8_64_add sse2/; - - add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - specialize qw/vp9_iht16x16_256_add sse2/; - } -} else { - # Force C versions if CONFIG_EMULATE_HARDWARE is 1 - if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - - add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - } else { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/; - - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/; - - add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/; +# Force C versions if CONFIG_EMULATE_HARDWARE is 1 +add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; + +add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; + +add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + +if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { + # Note that there are more specializations appended when + # CONFIG_VP9_HIGHBITDEPTH is off. + specialize qw/vp9_iht4x4_16_add sse2/; + specialize qw/vp9_iht8x8_64_add sse2/; + specialize qw/vp9_iht16x16_256_add sse2/; + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { + # Note that these specializations are appended to the above ones. 
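For readers unfamiliar with the rtcd perl above: add_proto declares one function prototype and specialize lists the SIMD flavors the generated dispatcher may bind at runtime, which is why this refactor can simply append extra specialize lines instead of duplicating whole highbitdepth branches. The generated header has roughly this shape (an illustrative sketch only; the real generated code differs in detail):

typedef short tran_low_t;  // int32_t in CONFIG_VP9_HIGHBITDEPTH builds

void vp9_iht4x4_16_add_c(const tran_low_t *input, unsigned char *dest,
                         int stride, int tx_type);
void vp9_iht4x4_16_add_sse2(const tran_low_t *input, unsigned char *dest,
                            int stride, int tx_type);

// Dispatcher: defaults to C, upgraded once CPU features are probed.
void (*vp9_iht4x4_16_add)(const tran_low_t *, unsigned char *, int, int) =
    vp9_iht4x4_16_add_c;

static void setup_rtcd(int have_sse2) {
  if (have_sse2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2;
}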
+ specialize qw/vp9_iht4x4_16_add neon dspr2 msa/; + specialize qw/vp9_iht8x8_64_add neon dspr2 msa/; + specialize qw/vp9_iht16x16_256_add dspr2 msa/; } } @@ -124,82 +118,69 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") { specialize qw/vp9_denoiser_filter neon sse2/; } -if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; - specialize qw/vp9_block_error avx2 sse2/; +add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; - add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; - specialize qw/vp9_highbd_block_error sse2/; +add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; - add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; - specialize qw/vp9_block_error_fp sse2/; +add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; - add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; +add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64"; - add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; +add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + specialize qw/vp9_block_error avx2 sse2/; + + specialize qw/vp9_block_error_fp avx2 sse2/; - add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_fdct8x8_quant neon ssse3/; + + add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; + specialize 
qw/vp9_highbd_block_error sse2/; } else { - add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; specialize qw/vp9_block_error avx2 msa sse2/; - add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size"; - specialize qw/vp9_block_error_fp neon sse2/; - - add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; + specialize qw/vp9_block_error_fp neon avx2 sse2/; - add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; - - add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/; } # fdct functions -if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht4x4 sse2/; - - add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht8x8 sse2/; +add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht16x16 sse2/; +add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fwht4x4 sse2/; -} else { - add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht4x4 sse2 msa/; +add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht8x8 sse2 msa/; +add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht16x16 sse2 msa/; - - add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fwht4x4 msa sse2/; +# Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH +# is off. +specialize qw/vp9_fht4x4 sse2/; +specialize qw/vp9_fht8x8 sse2/; +specialize qw/vp9_fht16x16 sse2/; +specialize qw/vp9_fwht4x4 sse2/; +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { + # Note that these specializations are appended to the above ones. 
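One reason the prototypes above can be shared between the two build flavors: the element type itself changes. tran_low_t is 16 bits in standard builds and 32 bits under CONFIG_VP9_HIGHBITDEPTH, since 10- and 12-bit input needs coefficient headroom beyond int16_t; this is also why the old non-highbitdepth vp9_block_error_fp prototype could use int16_t directly. Paraphrasing the vpx_dsp typedef:

#include <stdint.h>

#if CONFIG_VP9_HIGHBITDEPTH
typedef int32_t tran_low_t;  // 10/12-bit pixels overflow 16-bit coefficients
#else
typedef int16_t tran_low_t;  // 8-bit pixels: coefficients fit in 16 bits
#endif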
+ specialize qw/vp9_fht4x4 msa/; + specialize qw/vp9_fht8x8 msa/; + specialize qw/vp9_fht16x16 msa/; + specialize qw/vp9_fwht4x4 msa/; } # # Motion search # -add_proto qw/int vp9_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv"; -specialize qw/vp9_full_search_sad sse3 sse4_1/; -$vp9_full_search_sad_sse3=vp9_full_search_sadx3; -$vp9_full_search_sad_sse4_1=vp9_full_search_sadx8; - add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx/; +if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count"; specialize qw/vp9_temporal_filter_apply sse4_1/; +} if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { @@ -227,7 +208,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # frame based scale # add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler"; -specialize qw/vp9_scale_and_extend_frame ssse3/; +specialize qw/vp9_scale_and_extend_frame neon ssse3/; } # end encoder functions diff --git a/libvpx/vp9/common/vp9_thread_common.c b/libvpx/vp9/common/vp9_thread_common.c index 07e659d23..8d44e91f2 100644 --- a/libvpx/vp9/common/vp9_thread_common.c +++ b/libvpx/vp9/common/vp9_thread_common.c @@ -140,8 +140,9 @@ static INLINE void thread_loop_filter_rows( } // Row-based multi-threaded loopfilter hook -static int loop_filter_row_worker(VP9LfSync *const lf_sync, - LFWorkerData *const lf_data) { +static int loop_filter_row_worker(void *arg1, void *arg2) { + VP9LfSync *const lf_sync = (VP9LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only, lf_sync); @@ -183,7 +184,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, VPxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; - worker->hook = (VPxWorkerHook)loop_filter_row_worker; + worker->hook = loop_filter_row_worker; worker->data1 = lf_sync; worker->data2 = lf_data; diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index bb2dcf52b..6996260e2 100644 --- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -18,8 +18,8 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, __m128i in[2]; const __m128i eight = _mm_set1_epi16(8); - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); switch (tx_type) { case 0: // DCT_DCT @@ -54,18 +54,17 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[8]; - const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 
<< 4); // load input data - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 1); - in[2] = load_input_data(input + 8 * 2); - in[3] = load_input_data(input + 8 * 3); - in[4] = load_input_data(input + 8 * 4); - in[5] = load_input_data(input + 8 * 5); - in[6] = load_input_data(input + 8 * 6); - in[7] = load_input_data(input + 8 * 7); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8 * 1); + in[2] = load_input_data8(input + 8 * 2); + in[3] = load_input_data8(input + 8 * 3); + in[4] = load_input_data8(input + 8 * 4); + in[5] = load_input_data8(input + 8 * 5); + in[6] = load_input_data8(input + 8 * 6); + in[7] = load_input_data8(input + 8 * 7); switch (tx_type) { case 0: // DCT_DCT @@ -106,14 +105,91 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[6] = _mm_srai_epi16(in[6], 5); in[7] = _mm_srai_epi16(in[7], 5); - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); +} + +static INLINE void load_buffer_8x16(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 16); + in[1] = load_input_data8(input + 1 * 16); + in[2] = load_input_data8(input + 2 * 16); + in[3] = load_input_data8(input + 3 * 16); + in[4] = load_input_data8(input + 4 * 16); + in[5] = load_input_data8(input + 5 * 16); + in[6] = load_input_data8(input + 6 * 16); + in[7] = load_input_data8(input + 7 * 16); + + in[8] = load_input_data8(input + 8 * 16); + in[9] = load_input_data8(input + 9 * 16); + in[10] = load_input_data8(input + 10 * 16); + in[11] = load_input_data8(input + 11 * 16); + in[12] = load_input_data8(input + 12 * 16); + in[13] = load_input_data8(input + 13 * 16); + in[14] = load_input_data8(input + 14 * 16); + in[15] = load_input_data8(input + 15 * 16); +} + +static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in, + const int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = 
_mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); + recon_and_store(dest + 8 * stride, in[8]); + recon_and_store(dest + 9 * stride, in[9]); + recon_and_store(dest + 10 * stride, in[10]); + recon_and_store(dest + 11 * stride, in[11]); + recon_and_store(dest + 12 * stride, in[12]); + recon_and_store(dest + 13 * stride, in[13]); + recon_and_store(dest + 14 * stride, in[14]); + recon_and_store(dest + 15 * stride, in[15]); } void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, diff --git a/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm index 30852049b..ca0897ab9 100644 --- a/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm +++ b/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm @@ -12,6 +12,8 @@ ; TODO(jackychen): Find a way to fix the duplicate. %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp9_filter_by_weight16x16_sse2 ;( ; unsigned char *src, diff --git a/libvpx/vp9/decoder/vp9_decodeframe.c b/libvpx/vp9/decoder/vp9_decodeframe.c index 0760f8c23..d0e896c13 100644 --- a/libvpx/vp9/decoder/vp9_decodeframe.c +++ b/libvpx/vp9/decoder/vp9_decodeframe.c @@ -490,8 +490,8 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, #endif // CONFIG_VP9_HIGHBITDEPTH static void dec_build_inter_predictors( - VPxWorker *const worker, MACROBLOCKD *xd, int plane, int bw, int bh, int x, - int y, int w, int h, int mi_x, int mi_y, const InterpKernel *kernel, + MACROBLOCKD *xd, int plane, int bw, int bh, int x, int y, int w, int h, + int mi_x, int mi_y, const InterpKernel *kernel, const struct scale_factors *sf, struct buf_2d *pre_buf, struct buf_2d *dst_buf, const MV *mv, RefCntBuffer *ref_frame_buf, int is_scaled, int ref) { @@ -593,12 +593,6 @@ static void dec_build_inter_predictors( y_pad = 1; } - // Wait until reference block is ready. Pad 7 more pixels as last 7 - // pixels of each superblock row can be changed by next superblock row. - if (worker != NULL) - vp9_frameworker_wait(worker, ref_frame_buf, VPXMAX(0, (y1 + 7)) - << (plane == 0 ? 0 : 1)); - // Skip border extension if block is inside the frame. if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 || y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { @@ -617,14 +611,6 @@ static void dec_build_inter_predictors( w, h, ref, xs, ys); return; } - } else { - // Wait until reference block is ready. Pad 7 more pixels as last 7 - // pixels of each superblock row can be changed by next superblock row. - if (worker != NULL) { - const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS; - vp9_frameworker_wait(worker, ref_frame_buf, VPXMAX(0, (y1 + 7)) - << (plane == 0 ? 
0 : 1)); - } } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -653,8 +639,6 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, const int is_compound = has_second_ref(mi); int ref; int is_scaled; - VPxWorker *const fwo = - pbi->frame_parallel_decode ? pbi->frame_worker_owner : NULL; for (ref = 0; ref < 1 + is_compound; ++ref) { const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; @@ -686,10 +670,10 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, for (y = 0; y < num_4x4_h; ++y) { for (x = 0; x < num_4x4_w; ++x) { const MV mv = average_split_mvs(pd, mi, ref, i++); - dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, 4 * x, - 4 * y, 4, 4, mi_x, mi_y, kernel, sf, - pre_buf, dst_buf, &mv, ref_frame_buf, - is_scaled, ref); + dec_build_inter_predictors(xd, plane, n4w_x4, n4h_x4, 4 * x, 4 * y, + 4, 4, mi_x, mi_y, kernel, sf, pre_buf, + dst_buf, &mv, ref_frame_buf, is_scaled, + ref); } } } @@ -703,7 +687,7 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, const int n4w_x4 = 4 * num_4x4_w; const int n4h_x4 = 4 * num_4x4_h; struct buf_2d *const pre_buf = &pd->pre[ref]; - dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4, + dec_build_inter_predictors(xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4, n4h_x4, mi_x, mi_y, kernel, sf, pre_buf, dst_buf, &mv, ref_frame_buf, is_scaled, ref); } @@ -1187,7 +1171,6 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { resize_context_buffers(cm, width, height); setup_render_size(cm, rb); - lock_buffer_pool(pool); if (vpx_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -1197,12 +1180,11 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } - unlock_buffer_pool(pool); + pool->frame_bufs[cm->new_fb_idx].released = 0; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; @@ -1273,7 +1255,6 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, resize_context_buffers(cm, width, height); setup_render_size(cm, rb); - lock_buffer_pool(pool); if (vpx_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -1283,12 +1264,11 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } - unlock_buffer_pool(pool); + pool->frame_bufs[cm->new_fb_idx].released = 0; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; @@ -1384,7 +1364,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, pbi->lf_worker.data1 == NULL) { CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_memalign(32, sizeof(LFWorkerData))); - pbi->lf_worker.hook = 
(VPxWorkerHook)vp9_loop_filter_worker; + pbi->lf_worker.hook = vp9_loop_filter_worker; if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Loop filter thread creation failed"); @@ -1473,11 +1453,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, winterface->execute(&pbi->lf_worker); } } - // After loopfiltering, the last 7 row pixels in each superblock row may - // still be changed by the longest loopfilter of the next superblock - // row. - if (pbi->frame_parallel_decode) - vp9_frameworker_broadcast(pbi->cur_buf, mi_row << MI_BLOCK_SIZE_LOG2); } } @@ -1493,16 +1468,16 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, // Get last tile data. tile_data = pbi->tile_worker_data + tile_cols * tile_rows - 1; - if (pbi->frame_parallel_decode) - vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX); return vpx_reader_find_end(&tile_data->bit_reader); } // On entry 'tile_data->data_end' points to the end of the input frame, on exit // it is updated to reflect the bitreader position of the final tile column if // present in the tile buffer group or NULL otherwise. -static int tile_worker_hook(TileWorkerData *const tile_data, - VP9Decoder *const pbi) { +static int tile_worker_hook(void *arg1, void *arg2) { + TileWorkerData *const tile_data = (TileWorkerData *)arg1; + VP9Decoder *const pbi = (VP9Decoder *)arg2; + TileInfo *volatile tile = &tile_data->xd.tile; const int final_col = (1 << pbi->common.log2_tile_cols) - 1; const uint8_t *volatile bit_reader_end = NULL; @@ -1596,7 +1571,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, tile_data->xd = pbi->mb; tile_data->xd.counts = cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; - worker->hook = (VPxWorkerHook)tile_worker_hook; + worker->hook = tile_worker_hook; worker->data1 = tile_data; worker->data2 = pbi; } @@ -1779,24 +1754,17 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, if (cm->show_existing_frame) { // Show an existing frame directly. const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)]; - lock_buffer_pool(pool); if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Buffer %d does not contain a decoded frame", frame_to_show); } ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); - unlock_buffer_pool(pool); pbi->refresh_frame_flags = 0; cm->lf.filter_level = 0; cm->show_frame = 1; - if (pbi->frame_parallel_decode) { - for (i = 0; i < REF_FRAMES; ++i) - cm->next_ref_frame_map[i] = cm->ref_frame_map[i]; - } return 0; } @@ -1913,7 +1881,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, cm->frame_context_idx = vpx_rb_read_literal(rb, FRAME_CONTEXTS_LOG2); // Generate next_ref_frame_map. - lock_buffer_pool(pool); for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { if (mask & 1) { cm->next_ref_frame_map[ref_index] = cm->new_fb_idx; @@ -1933,7 +1900,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, if (cm->ref_frame_map[ref_index] >= 0) ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count; } - unlock_buffer_pool(pool); pbi->hold_ref_buf = 1; if (frame_is_intra_only(cm) || cm->error_resilient_mode) @@ -2090,24 +2056,6 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, vp9_loop_filter_frame_init(cm, cm->lf.filter_level); } - // If encoded in frame parallel mode, frame context is ready after decoding - // the frame header. 
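ref_cnt_fb(), used in the show_existing_frame path above, retargets a frame index while keeping reference counts balanced: drop one reference on the buffer the index used to name and take one on the new buffer. Roughly (a simplified sketch of the helper in vp9_onyxc_int.h):

static void ref_cnt_fb_sketch(int *ref_count, int *idx, int new_idx) {
  const int old_idx = *idx;
  if (old_idx >= 0 && ref_count[old_idx] > 0) --ref_count[old_idx];
  *idx = new_idx;        // the index now names the new buffer...
  ++ref_count[new_idx];  // ...which gains one reference
}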
- if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) { - VPxWorker *const worker = pbi->frame_worker_owner; - FrameWorkerData *const frame_worker_data = worker->data1; - if (cm->refresh_frame_context) { - context_updated = 1; - cm->frame_contexts[cm->frame_context_idx] = *cm->fc; - } - vp9_frameworker_lock_stats(worker); - pbi->cur_buf->row = -1; - pbi->cur_buf->col = -1; - frame_worker_data->frame_context_ready = 1; - // Signal the main thread that context is ready. - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - } - if (pbi->tile_worker_data == NULL || (tile_cols * tile_rows) != pbi->total_tiles) { const int num_tile_workers = diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c index 1a4152436..0a781413b 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libvpx/vp9/decoder/vp9_decodemv.c @@ -455,12 +455,6 @@ static void dec_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *best_mv, } } -static void fpm_sync(void *const data, int mi_row) { - VP9Decoder *const pbi = (VP9Decoder *)data; - vp9_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame, - mi_row << MI_BLOCK_SIZE_LOG2); -} - // This macro is used to add a motion vector mv_ref list if it isn't // already in the list. If it's the second motion vector or early_break // it will also skip all additional processing and jump to Done! @@ -500,8 +494,7 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, const POSITION *const mv_ref_search, int_mv *mv_ref_list, int mi_row, int mi_col, - int block, int is_sub8x8, find_mv_refs_sync sync, - void *const data) { + int block, int is_sub8x8) { const int *ref_sign_bias = cm->ref_frame_sign_bias; int i, refmv_count = 0; int different_ref_found = 0; @@ -557,23 +550,8 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, } } -// TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast -// on windows platform. The sync here is unnecessary if use_prev_frame_mvs -// is 0. But after removing it, there will be hang in the unit test on windows -// due to several threads waiting for a thread's signal. -#if defined(_WIN32) && !HAVE_PTHREAD_H - if (cm->frame_parallel_decode && sync != NULL) { - sync(data, mi_row); - } -#endif - // Check the last frame's mode and mv info. if (prev_frame_mvs) { - // Synchronize here for frame parallel decode if sync function is provided. 
- if (cm->frame_parallel_decode && sync != NULL) { - sync(data, mi_row); - } - if (prev_frame_mvs->ref_frame[0] == ref_frame) { ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done); } else if (prev_frame_mvs->ref_frame[1] == ref_frame) { @@ -652,7 +630,7 @@ static void append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, refmv_count = dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search, - mv_list, mi_row, mi_col, block, 1, NULL, NULL); + mv_list, mi_row, mi_col, block, 1); switch (block) { case 0: best_sub8x8->as_int = mv_list[refmv_count - 1].as_int; break; @@ -750,9 +728,8 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; int refmv_count; - refmv_count = - dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, tmp_mvs, - mi_row, mi_col, -1, 0, fpm_sync, (void *)pbi); + refmv_count = dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, + tmp_mvs, mi_row, mi_col, -1, 0); dec_find_best_ref_mvs(allow_hp, tmp_mvs, &best_ref_mvs[ref], refmv_count); diff --git a/libvpx/vp9/decoder/vp9_decoder.c b/libvpx/vp9/decoder/vp9_decoder.c index 37693f094..a913fa560 100644 --- a/libvpx/vp9/decoder/vp9_decoder.c +++ b/libvpx/vp9/decoder/vp9_decoder.c @@ -139,6 +139,7 @@ void vp9_decoder_remove(VP9Decoder *pbi) { vp9_loop_filter_dealloc(&pbi->lf_row_sync); } + vp9_remove_common(&pbi->common); vpx_free(pbi); } @@ -169,7 +170,7 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi, vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Incorrect buffer dimensions"); else - vp8_yv12_copy_frame(cfg, sd); + vpx_yv12_copy_frame(cfg, sd); } else { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame"); } @@ -217,7 +218,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, "Incorrect buffer dimensions"); } else { // Overwrite the reference frame buffer. - vp8_yv12_copy_frame(sd, ref_buf); + vpx_yv12_copy_frame(sd, ref_buf); } return cm->error.error_code; @@ -230,7 +231,6 @@ static void swap_frame_buffers(VP9Decoder *pbi) { BufferPool *const pool = cm->buffer_pool; RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - lock_buffer_pool(pool); for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { const int old_idx = cm->ref_frame_map[ref_index]; // Current thread releases the holding of reference frame. @@ -250,15 +250,10 @@ static void swap_frame_buffers(VP9Decoder *pbi) { decrease_ref_count(old_idx, frame_bufs, pool); cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; } - unlock_buffer_pool(pool); pbi->hold_ref_buf = 0; cm->frame_to_show = get_frame_new_buffer(cm); - if (!pbi->frame_parallel_decode || !cm->show_frame) { - lock_buffer_pool(pool); - --frame_bufs[cm->new_fb_idx].ref_count; - unlock_buffer_pool(pool); - } + --frame_bufs[cm->new_fb_idx].ref_count; // Invalidate these references until the next frame starts. for (ref_index = 0; ref_index < 3; ref_index++) @@ -292,11 +287,13 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, pbi->ready_for_new_data = 0; // Check if the previous frame was a frame without any references to it. - // Release frame buffer if not decoding in frame parallel mode. 
- if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0 && - frame_bufs[cm->new_fb_idx].ref_count == 0) + if (cm->new_fb_idx >= 0 && frame_bufs[cm->new_fb_idx].ref_count == 0 && + !frame_bufs[cm->new_fb_idx].released) { pool->release_fb_cb(pool->cb_priv, &frame_bufs[cm->new_fb_idx].raw_frame_buffer); + frame_bufs[cm->new_fb_idx].released = 1; + } + // Find a free frame buffer. Return error if can not find any. cm->new_fb_idx = get_free_fb(cm); if (cm->new_fb_idx == INVALID_IDX) { @@ -309,18 +306,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; pbi->hold_ref_buf = 0; - if (pbi->frame_parallel_decode) { - VPxWorker *const worker = pbi->frame_worker_owner; - vp9_frameworker_lock_stats(worker); - frame_bufs[cm->new_fb_idx].frame_worker_owner = worker; - // Reset decoding progress. - pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; - pbi->cur_buf->row = -1; - pbi->cur_buf->col = -1; - vp9_frameworker_unlock_stats(worker); - } else { - pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; - } + pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; if (setjmp(cm->error.jmp)) { const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); @@ -336,7 +322,6 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, winterface->sync(&pbi->tile_workers[i]); } - lock_buffer_pool(pool); // Release all the reference buffers if worker thread is holding them. if (pbi->hold_ref_buf == 1) { int ref_index = 0, mask; @@ -361,7 +346,6 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, } // Release current frame. decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); - unlock_buffer_pool(pool); vpx_clear_system_state(); return -1; @@ -377,31 +361,14 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, if (!cm->show_existing_frame) { cm->last_show_frame = cm->show_frame; cm->prev_frame = cm->cur_frame; - if (cm->seg.enabled && !pbi->frame_parallel_decode) - vp9_swap_current_and_last_seg_map(cm); + if (cm->seg.enabled) vp9_swap_current_and_last_seg_map(cm); } // Update progress in frame parallel decode. - if (pbi->frame_parallel_decode) { - // Need to lock the mutex here as another thread may - // be accessing this buffer. - VPxWorker *const worker = pbi->frame_worker_owner; - FrameWorkerData *const frame_worker_data = worker->data1; - vp9_frameworker_lock_stats(worker); - - if (cm->show_frame) { - cm->current_video_frame++; - } - frame_worker_data->frame_decoded = 1; - frame_worker_data->frame_context_ready = 1; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - } else { - cm->last_width = cm->width; - cm->last_height = cm->height; - if (cm->show_frame) { - cm->current_video_frame++; - } + cm->last_width = cm->width; + cm->last_height = cm->height; + if (cm->show_frame) { + cm->current_video_frame++; } cm->error.setjmp = 0; diff --git a/libvpx/vp9/decoder/vp9_decoder.h b/libvpx/vp9/decoder/vp9_decoder.h index 427baf1e0..4b26c314d 100644 --- a/libvpx/vp9/decoder/vp9_decoder.h +++ b/libvpx/vp9/decoder/vp9_decoder.h @@ -21,7 +21,6 @@ #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" -#include "vp9/decoder/vp9_dthread.h" #ifdef __cplusplus extern "C" { @@ -53,13 +52,10 @@ typedef struct VP9Decoder { int refresh_frame_flags; - int frame_parallel_decode; // frame-based threading. - // TODO(hkuang): Combine this with cur_buf in macroblockd as they are // the same. RefCntBuffer *cur_buf; // Current decoding frame buffer. 
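The `if (setjmp(cm->error.jmp))` block above is the decoder's error trampoline: vpx_internal_error() longjmps back to it on a fatal parse error so held buffers can be released before returning failure. The minimal shape of the pattern:

#include <setjmp.h>

static jmp_buf error_jmp;

static void fatal_error(void) { longjmp(error_jmp, 1); }

static int decode_one_frame(void) {
  if (setjmp(error_jmp)) {
    // Landed here via fatal_error(): release held references, then bail.
    return -1;
  }
  // ... parse and decode; any fatal problem calls fatal_error() ...
  return 0;
}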
- VPxWorker *frame_worker_owner; // frame_worker that owns this pbi. VPxWorker lf_worker; VPxWorker *tile_workers; TileWorkerData *tile_worker_data; @@ -121,9 +117,10 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, // But the private buffer is not set up until finish decoding header. // So any error happens during decoding header, the frame_bufs will not // have valid priv buffer. - if (frame_bufs[idx].ref_count == 0 && + if (!frame_bufs[idx].released && frame_bufs[idx].ref_count == 0 && frame_bufs[idx].raw_frame_buffer.priv) { pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer); + frame_bufs[idx].released = 1; } } } diff --git a/libvpx/vp9/decoder/vp9_dthread.c b/libvpx/vp9/decoder/vp9_dthread.c deleted file mode 100644 index 52bc2a0f6..000000000 --- a/libvpx/vp9/decoder/vp9_dthread.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_reconinter.h" -#include "vp9/decoder/vp9_dthread.h" -#include "vp9/decoder/vp9_decoder.h" - -// #define DEBUG_THREAD - -// TODO(hkuang): Clean up all the #ifdef in this file. -void vp9_frameworker_lock_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - pthread_mutex_lock(&worker_data->stats_mutex); -#else - (void)worker; -#endif -} - -void vp9_frameworker_unlock_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - pthread_mutex_unlock(&worker_data->stats_mutex); -#else - (void)worker; -#endif -} - -void vp9_frameworker_signal_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - -// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper. -#if defined(_WIN32) && !HAVE_PTHREAD_H - pthread_cond_signal(&worker_data->stats_cond); -#else - pthread_cond_broadcast(&worker_data->stats_cond); -#endif - -#else - (void)worker; -#endif -} - -// This macro prevents thread_sanitizer from reporting known concurrent writes. -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define BUILDING_WITH_TSAN -#endif -#endif - -// TODO(hkuang): Remove worker parameter as it is only used in debug code. -void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf, - int row) { -#if CONFIG_MULTITHREAD - if (!ref_buf) return; - -#ifndef BUILDING_WITH_TSAN - // The following line of code will get harmless tsan error but it is the key - // to get best performance. - if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return; -#endif - - { - // Find the worker thread that owns the reference frame. If the reference - // frame has been fully decoded, it may not have owner. 
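The deleted vp9_frameworker_wait() above is an instance of the standard condition-variable idiom: the predicate is re-checked in a loop because pthread_cond_wait() may wake spuriously. Stripped to its core, with Progress standing in for the per-worker stats state:

#include <pthread.h>

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  int row;  // last fully decoded row, published by the owning worker
} Progress;

static void wait_for_row(Progress *p, int row) {
  pthread_mutex_lock(&p->mutex);
  while (p->row < row) pthread_cond_wait(&p->cond, &p->mutex);
  pthread_mutex_unlock(&p->mutex);
}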
- VPxWorker *const ref_worker = ref_buf->frame_worker_owner; - FrameWorkerData *const ref_worker_data = - (FrameWorkerData *)ref_worker->data1; - const VP9Decoder *const pbi = ref_worker_data->pbi; - -#ifdef DEBUG_THREAD - { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - printf("%d %p worker is waiting for %d %p worker (%d) ref %d \r\n", - worker_data->worker_id, worker, ref_worker_data->worker_id, - ref_buf->frame_worker_owner, row, ref_buf->row); - } -#endif - - vp9_frameworker_lock_stats(ref_worker); - while (ref_buf->row < row && pbi->cur_buf == ref_buf && - ref_buf->buf.corrupted != 1) { - pthread_cond_wait(&ref_worker_data->stats_cond, - &ref_worker_data->stats_mutex); - } - - if (ref_buf->buf.corrupted == 1) { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - vp9_frameworker_unlock_stats(ref_worker); - vpx_internal_error(&worker_data->pbi->common.error, - VPX_CODEC_CORRUPT_FRAME, - "Worker %p failed to decode frame", worker); - } - vp9_frameworker_unlock_stats(ref_worker); - } -#else - (void)worker; - (void)ref_buf; - (void)row; - (void)ref_buf; -#endif // CONFIG_MULTITHREAD -} - -void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) { -#if CONFIG_MULTITHREAD - VPxWorker *worker = buf->frame_worker_owner; - -#ifdef DEBUG_THREAD - { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id, - buf->frame_worker_owner, row); - } -#endif - - vp9_frameworker_lock_stats(worker); - buf->row = row; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); -#else - (void)buf; - (void)row; -#endif // CONFIG_MULTITHREAD -} - -void vp9_frameworker_copy_context(VPxWorker *const dst_worker, - VPxWorker *const src_worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1; - FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1; - VP9_COMMON *const src_cm = &src_worker_data->pbi->common; - VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common; - int i; - - // Wait until source frame's context is ready. - vp9_frameworker_lock_stats(src_worker); - while (!src_worker_data->frame_context_ready) { - pthread_cond_wait(&src_worker_data->stats_cond, - &src_worker_data->stats_mutex); - } - - dst_cm->last_frame_seg_map = src_cm->seg.enabled - ? src_cm->current_frame_seg_map - : src_cm->last_frame_seg_map; - dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync; - vp9_frameworker_unlock_stats(src_worker); - - dst_cm->bit_depth = src_cm->bit_depth; -#if CONFIG_VP9_HIGHBITDEPTH - dst_cm->use_highbitdepth = src_cm->use_highbitdepth; -#endif - dst_cm->prev_frame = - src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame; - dst_cm->last_width = - !src_cm->show_existing_frame ? src_cm->width : src_cm->last_width; - dst_cm->last_height = - !src_cm->show_existing_frame ? src_cm->height : src_cm->last_height; - dst_cm->subsampling_x = src_cm->subsampling_x; - dst_cm->subsampling_y = src_cm->subsampling_y; - dst_cm->frame_type = src_cm->frame_type; - dst_cm->last_show_frame = !src_cm->show_existing_frame - ? 
src_cm->show_frame - : src_cm->last_show_frame; - for (i = 0; i < REF_FRAMES; ++i) - dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i]; - - memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr, - (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh)); - dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level; - dst_cm->lf.filter_level = src_cm->lf.filter_level; - memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS); - memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS); - dst_cm->seg = src_cm->seg; - memcpy(dst_cm->frame_contexts, src_cm->frame_contexts, - FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0])); -#else - (void)dst_worker; - (void)src_worker; -#endif // CONFIG_MULTITHREAD -} diff --git a/libvpx/vp9/decoder/vp9_dthread.h b/libvpx/vp9/decoder/vp9_dthread.h deleted file mode 100644 index fce0fe7fe..000000000 --- a/libvpx/vp9/decoder/vp9_dthread.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_DECODER_VP9_DTHREAD_H_ -#define VP9_DECODER_VP9_DTHREAD_H_ - -#include "./vpx_config.h" -#include "vpx_util/vpx_thread.h" -#include "vpx/internal/vpx_codec_internal.h" - -#ifdef __cplusplus -extern "C" { -#endif - -struct VP9Common; -struct VP9Decoder; - -// WorkerData for the FrameWorker thread. It contains all the information of -// the worker and decode structures for decoding a frame. -typedef struct FrameWorkerData { - struct VP9Decoder *pbi; - const uint8_t *data; - const uint8_t *data_end; - size_t data_size; - void *user_priv; - int result; - int worker_id; - int received_frame; - - // scratch_buffer is used in frame parallel mode only. - // It is used to make a copy of the compressed data. - uint8_t *scratch_buffer; - size_t scratch_buffer_size; - -#if CONFIG_MULTITHREAD - pthread_mutex_t stats_mutex; - pthread_cond_t stats_cond; -#endif - - int frame_context_ready; // Current frame's context is ready to read. - int frame_decoded; // Finished decoding current frame. -} FrameWorkerData; - -void vp9_frameworker_lock_stats(VPxWorker *const worker); -void vp9_frameworker_unlock_stats(VPxWorker *const worker); -void vp9_frameworker_signal_stats(VPxWorker *const worker); - -// Wait until ref_buf has been decoded to row in real pixel unit. -// Note: worker may already finish decoding ref_buf and release it in order to -// start decoding next frame. So need to check whether worker is still decoding -// ref_buf. -void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf, - int row); - -// FrameWorker broadcasts its decoding progress so other workers that are -// waiting on it can resume decoding. -void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row); - -// Copy necessary decoding context from src worker to dst worker. 
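vp9_frameworker_broadcast(), whose prototype is deleted above, was the publishing half of the same idiom: update the shared progress under the mutex, then wake every waiter. Reusing the Progress stand-in from the wait sketch earlier:

static void publish_row(Progress *p, int row) {
  pthread_mutex_lock(&p->mutex);
  p->row = row;  // store under the mutex so no waiter misses the update
  pthread_cond_broadcast(&p->cond);
  pthread_mutex_unlock(&p->mutex);
}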
-void vp9_frameworker_copy_context(VPxWorker *const dst_worker, - VPxWorker *const src_worker); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_DECODER_VP9_DTHREAD_H_ diff --git a/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c new file mode 100644 index 000000000..e46f789ba --- /dev/null +++ b/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c @@ -0,0 +1,843 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_scale/yv12config.h" + +// Note: The scaling functions could write extra rows and columns in dst, which +// exceed the right and bottom boundaries of the destination frame. We rely on +// the following frame extension function to fix these rows and columns. + +static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x2_t s = vld2q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x4_t s = vld4q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_bilinear_kernel( + const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2, + const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1, + uint8_t *const dst) { + const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0); + const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0); + const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0); + const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0); + const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1); + const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1); + const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), coef1); + const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1); + + const uint8x8_t hor0 = vrshrn_n_u16(h4, 7); // temp: 00 01 02 03 04 05 06 07 + const uint8x8_t hor1 = vrshrn_n_u16(h5, 7); // temp: 08 09 0A 0B 0C 0D 0E 0F + const uint8x8_t hor2 = vrshrn_n_u16(h6, 7); // temp: 10 11 12 13 14 15 16 17 + const uint8x8_t hor3 = vrshrn_n_u16(h7, 7); // temp: 18 19 1A 1B 1C 1D 1E 1F + const uint16x8_t v0 = vmull_u8(hor0, coef0); + const uint16x8_t v1 = vmull_u8(hor1, coef0); + const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1); + const uint16x8_t v3 = 
vmlal_u8(v1, hor3, coef1); + // dst: 0 1 2 3 4 5 6 7 8 9 A B C D E F + const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7)); + vst1q_u8(dst, d); +} + +static INLINE void scale_plane_2_to_1_bilinear( + const uint8_t *const src, const int src_stride, uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // 000 002 004 006 008 00A 00C 00E 010 012 014 016 018 01A 01C 01E + // 001 003 005 007 009 00B 00D 00F 011 013 015 017 019 01B 01D 01F + const uint8x16x2_t s0 = vld2q_u8(src0); + // 100 102 104 106 108 10A 10C 10E 110 112 114 116 118 11A 11C 11E + // 101 103 105 107 109 10B 10D 10F 111 113 115 117 119 11B 11D 11F + const uint8x16x2_t s1 = vld2q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 32; + src1 += 32; + dst += 16; + x -= 16; + } while (x); + src0 += 2 * (src_stride - max_width); + src1 += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_bilinear( + const uint8_t *const src, const int src_stride, uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // (*) -- useless + // 000 004 008 00C 010 014 018 01C 020 024 028 02C 030 034 038 03C + // 001 005 009 00D 011 015 019 01D 021 025 029 02D 031 035 039 03D + // 002 006 00A 00E 012 016 01A 01E 022 026 02A 02E 032 036 03A 03E (*) + // 003 007 00B 00F 013 017 01B 01F 023 027 02B 02F 033 037 03B 03F (*) + const uint8x16x4_t s0 = vld4q_u8(src0); + // 100 104 108 10C 110 114 118 11C 120 124 128 12C 130 134 138 13C + // 101 105 109 10D 111 115 119 11D 121 125 129 12D 131 135 139 13D + // 102 106 10A 10E 112 116 11A 11E 122 126 12A 12E 132 136 13A 13E (*) + // 103 107 10B 10F 113 117 11B 11F 123 127 12B 12F 133 137 13B 13F (*) + const uint8x16x4_t s1 = vld4q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 64; + src1 += 64; + dst += 16; + x -= 16; + } while (x); + src0 += 4 * (src_stride - max_width); + src1 += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s, + const uint8x8_t *const coef) { + const uint16x8_t h0 = vmull_u8(s[0], coef[0]); + const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]); + + return vrshrn_n_u16(h1, 7); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[14], d[4]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + 
SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + // Note: processing 4x8 is about 20% faster than processing row by row using + // vld4_u8(). + do { + load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[2], filters); // 01 11 21 31 41 51 61 71 + d[2] = scale_filter_8(&s[4], filters); // 02 12 22 32 42 52 62 72 + d[3] = scale_filter_8(&s[6], filters); // 03 13 23 33 43 53 63 73 + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]); + vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]), + 0); + vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]), + 0); + vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]), + 0); + vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]), + 0); + vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]), + 1); + vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]), + 1); + vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]), + 1); + vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]), + 1); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + t += 4; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 6 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[2], filters); // 10 11 12 13 14 15 16 17 + d[2] = scale_filter_8(&s[4], filters); // 20 21 22 23 24 25 26 27 + d[3] = scale_filter_8(&s[6], filters); // 30 31 32 33 34 35 36 37 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[12], d[2]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + // Note: processing 2x8 is about 20% faster than processing row by row 
using + // vld4_u8(). + do { + load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]); + x = width_hor; + + do { + uint8x8x2_t dd; + src += 8; + load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10], + &s[11]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[4], filters); // 01 11 21 31 41 51 61 71 + // dd.val[0]: 00 01 20 21 40 41 60 61 + // dd.val[1]: 10 11 30 31 50 51 70 71 + dd = vtrn_u8(d[0], d[1]); + vst1_lane_u16((uint16_t *)(t + 0 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 0); + vst1_lane_u16((uint16_t *)(t + 1 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 0); + vst1_lane_u16((uint16_t *)(t + 2 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 1); + vst1_lane_u16((uint16_t *)(t + 3 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 1); + vst1_lane_u16((uint16_t *)(t + 4 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 2); + vst1_lane_u16((uint16_t *)(t + 5 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 2); + vst1_lane_u16((uint16_t *)(t + 6 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 3); + vst1_lane_u16((uint16_t *)(t + 7 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 3); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + t += 2; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]); + t += 4 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[4], filters); // 10 11 12 13 14 15 16 17 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +// Notes for 4 to 3 scaling: +// +// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be +// multiple of 6, and no less than w. +// +// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be +// multiple of 8, and no less than w. +// +// 3. 8 columns are calculated in each horizontal inner loop for further +// vertical scaling, so height_hor must be multiple of 8, and no less than +// 4 * h / 3. +// +// 4. 6 columns are calculated in each vertical inner loop, so height_ver must +// be multiple of 6, and no less than h. +// +// 5. The physical location of the last row of the 4 to 3 scaled frame is +// decided by phase_scaler, and are always less than 1 pixel below the last row +// of the original image. + +static void scale_plane_4_to_3_bilinear(const uint8_t *src, + const int src_stride, uint8_t *dst, + const int dst_stride, const int w, + const int h, const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We only need 1 extra row below because there are only 2 bilinear + // coefficients. 
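/*
 * [Editor's aside] A standalone illustration of the Q4 stepping used by the
 * 4-to-3 paths here: step_q4 = 16 * 4 / 3 = 21 sixteenths of a source pixel
 * per output pixel, so each group of 3 outputs advances the source by
 * exactly 4 pixels and only three filter phases ever occur. Demo only;
 * constants mirror the file:
 */
#include <stdio.h>

#define SUBPEL_MASK 15

int main(void) {
  const int step_q4 = 16 * 4 / 3; /* == 21 */
  const int phase_scaler = 8;     /* any value in [0, 16) */
  int group, k;
  for (group = 0; group < 2; ++group) {
    for (k = 0; k < 3; ++k) {
      const int pos = phase_scaler + k * step_q4;
      printf("out %d: src pixel %d, filter phase %d\n", 3 * group + k,
             4 * group + (pos >> 4), pos & SUBPEL_MASK);
    }
  }
  return 0;
}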
+ const int height_hor = (4 * h / 3 + 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[9], d[8], c[6]; + + assert(w && h); + + c[0] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][3]); + c[1] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][4]); + c[2] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) & + SUBPEL_MASK][3]); + c[3] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) & + SUBPEL_MASK][4]); + c[4] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) & + SUBPEL_MASK][3]); + c[5] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) & + SUBPEL_MASK][4]); + + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + src += 1; + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + src += 8; + transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3 - 1; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + t += 8 * stride_hor; + + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 
4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 1); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + const int16x8_t filters0 = + vld1q_s16(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters1 = + vld1q_s16(coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters2 = + vld1q_s16(coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[15], d[8]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2; + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13], &s[14]); + transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13], + &s[14]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 7 * stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], 
&s[12], + &s[13], &s[14]); + t += 8 * stride_hor; + + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 7); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, + int phase_scaler) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + const int dst_uv_w = dst_w / 2; + const int dst_uv_h = dst_h / 2; + int scaled = 0; + + // phase_scaler is usually 0 or 8. + assert(phase_scaler >= 0 && phase_scaler < 16); + + if (2 * dst_w == src_w && 2 * dst_h == src_h) { + // 2 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0, c1); + scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + } else { + const int buffer_stride = (dst_w + 3) & ~3; + const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_2_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_2_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_2_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, 
dst_uv_w, dst_uv_h); + scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0, c1); + scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + } else { + const int buffer_stride = (dst_w + 1) & ~1; + const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_4_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_4_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_4_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride = (dst_w + 5) - ((dst_w + 5) % 6) + 2; + const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scaled = 1; + if (filter_type == BILINEAR) { + scale_plane_4_to_3_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, phase_scaler, + temp_buffer); + scale_plane_4_to_3_bilinear(src->u_buffer, src->uv_stride, + dst->u_buffer, dst->uv_stride, dst_uv_w, + dst_uv_h, phase_scaler, temp_buffer); + scale_plane_4_to_3_bilinear(src->v_buffer, src->uv_stride, + dst->v_buffer, dst->uv_stride, dst_uv_w, + dst_uv_h, phase_scaler, temp_buffer); + } else { + scale_plane_4_to_3_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], + phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], + phase_scaler, temp_buffer); + } + free(temp_buffer); + } + } + + if (scaled) { + vpx_extend_frame_borders(dst); + } else { + // Call c version for all other scaling ratios. 
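/*
 * [Editor's aside] The ratio dispatch in vp9_scale_and_extend_frame_neon
 * reduces to exact integer tests; a standalone restatement (a sketch,
 * scale_path is a hypothetical name):
 */
static int scale_path(int src_w, int src_h, int dst_w, int dst_h) {
  if (2 * dst_w == src_w && 2 * dst_h == src_h) return 21; /* 2:1 fast path */
  if (4 * dst_w == src_w && 4 * dst_h == src_h) return 41; /* 4:1 fast path */
  if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) return 43; /* 4:3 */
  return 0; /* any other ratio: fall back to vp9_scale_and_extend_frame_c */
}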
+ vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); + } +} diff --git a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c index 0b175969b..97a09bdff 100644 --- a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -9,9 +9,10 @@ */ #include <arm_neon.h> - +#include <assert.h> #include <math.h> +#include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_quant_common.h" @@ -31,86 +32,206 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + int i; + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_one = vdupq_n_s16(1); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + int16x8_t v_round = vmovq_n_s16(round_ptr[1]); + int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); + int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); + + (void)scan; + (void)skip_block; + assert(!skip_block); + + // adjust for dc + v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); + v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); + v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + // process dc and the first seven ac coeffs + { + const int16x8_t v_iscan = vld1q_s16(&iscan[0]); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + // now process the rest of the ac coeffs + for (i = 8; i < count; i += 8) { + const int16x8_t v_iscan = vld1q_s16(&iscan[i]); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = 
vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff); + } + { + const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210), + vget_high_s16(v_eobmax_76543210)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); + } +} + +static INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan_ptr) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = vdupq_n_s16(-1); + + // ROUND_POWER_OF_TWO(round_ptr[], 1) + const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + const int16x8_t quant = vld1q_s16(quant_ptr); + const int16x4_t dequant = vld1_s16(dequant_ptr); + // dequant >> 2 is used similar to zbin as a threshold. + const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); + + // Process dc and the first seven ac coeffs. + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + const int16x8_t dequant_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); + + int16x8_t qcoeff = vaddq_s16(coeff_abs, round); + int32x4_t dqcoeff_0, dqcoeff_1; + int16x8_t dqcoeff; + uint16x8_t eob_max; (void)scan; + (void)count; + (void)skip_block; + assert(!skip_block); + + // coeff * quant_ptr[]) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant); + + // Restore sign. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + qcoeff = vandq_s16(qcoeff, dequant_mask); + + // qcoeff * dequant[] / 2 + dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), dequant); + dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. + // Add 1 if negative to round towards zero because the C uses division. 
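/*
 * [Editor's aside] Why the sign bit is added before the narrowing shift
 * here: C integer division truncates toward zero, while an arithmetic
 * right shift rounds toward negative infinity. A scalar check (assumes
 * the usual arithmetic >> on negative values, as on libvpx targets):
 */
#include <assert.h>
#include <stdint.h>

int main(void) {
  const int32_t v = -7;
  const int32_t sign = (int32_t)((uint32_t)v >> 31); /* 1 if negative */
  assert(v / 2 == -3);                /* division truncates toward zero */
  assert((v >> 1) == -4);             /* plain shift rounds down instead */
  assert(((v + sign) >> 1) == v / 2); /* corrected shift matches division */
  return 0;
}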
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + + iscan_ptr += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + + { int i; - const int16x8_t v_zero = vdupq_n_s16(0); - const int16x8_t v_one = vdupq_n_s16(1); - int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); - int16x8_t v_round = vmovq_n_s16(round_ptr[1]); - int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); - int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); - // adjust for dc - v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); - v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); - v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); - // process dc and the first seven ac coeffs - { - const int16x8_t v_iscan = vld1q_s16(&iscan[0]); - const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); - store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); - v_round = vmovq_n_s16(round_ptr[1]); - v_quant = vmovq_n_s16(quant_ptr[1]); - v_dequant = vmovq_n_s16(dequant_ptr[1]); - } - // now process the rest of the ac coeffs - for (i = 8; i < count; i += 8) { - const int16x8_t v_iscan = vld1q_s16(&iscan[i]); - const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); - store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff); + const int16x8_t round = vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1); + const int16x8_t quant = vmovq_n_s16(quant_ptr[1]); + const int16x8_t dequant_thresh = + vshrq_n_s16(vmovq_n_s16(dequant_ptr[1]), 2); + + // Process the rest of the ac coeffs. 
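/*
 * [Editor's aside] One coefficient through the 32x32 fp quantizer, in
 * scalar form. A sketch: vqdmulhq_s16(a, b) computes the saturating
 * (2 * a * b) >> 16, i.e. roughly (a * b) >> 15; saturation is omitted
 * here and the function name is hypothetical:
 */
#include <stdint.h>

static void quantize_fp_32x32_scalar(int16_t coeff, int16_t round,
                                     int16_t quant, int16_t dequant,
                                     int16_t *qcoeff, int16_t *dqcoeff) {
  const int abs_coeff = coeff < 0 ? -coeff : coeff;
  int q = 0;
  if (abs_coeff >= (dequant >> 2)) /* dequant / 4 acts as the zbin */
    q = ((abs_coeff + ((round + 1) >> 1)) * quant) >> 15;
  if (coeff < 0) q = -q;
  *qcoeff = (int16_t)q;
  *dqcoeff = (int16_t)(q * dequant / 2); /* division truncates toward zero */
}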
+ for (i = 8; i < 32 * 32; i += 8) { + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + const int16x8_t dequant_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); + + int16x8_t qcoeff = vaddq_s16(coeff_abs, round); + int32x4_t dqcoeff_0, dqcoeff_1; + int16x8_t dqcoeff; + + qcoeff = vqdmulhq_s16(qcoeff, quant); + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + qcoeff = vandq_s16(qcoeff, dequant_mask); + + dqcoeff_0 = vmull_n_s16(vget_low_s16(qcoeff), dequant_ptr[1]); + dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); + + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + + iscan_ptr += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; } + { - const int16x4_t v_eobmax_3210 = vmax_s16( - vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210)); - const int64x1_t v_eobmax_xx32 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); - const int16x4_t v_eobmax_tmp = - vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); - const int64x1_t v_eobmax_xxx3 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); - const int16x4_t v_eobmax_final = - vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); - - *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); } - } else { - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - *eob_ptr = 0; } } diff --git a/libvpx/vp9/encoder/vp9_alt_ref_aq.c b/libvpx/vp9/encoder/vp9_alt_ref_aq.c index 3aeefb584..acc3764c7 100644 --- a/libvpx/vp9/encoder/vp9_alt_ref_aq.c +++ b/libvpx/vp9/encoder/vp9_alt_ref_aq.c @@ -15,7 +15,7 @@ struct ALT_REF_AQ { int dummy; }; -struct ALT_REF_AQ *vp9_alt_ref_aq_create() { +struct ALT_REF_AQ *vp9_alt_ref_aq_create(void) { return (struct ALT_REF_AQ *)vpx_malloc(sizeof(struct ALT_REF_AQ)); } diff --git a/libvpx/vp9/encoder/vp9_alt_ref_aq.h b/libvpx/vp9/encoder/vp9_alt_ref_aq.h index 18acd8a85..e508cb44a 100644 --- a/libvpx/vp9/encoder/vp9_alt_ref_aq.h +++ b/libvpx/vp9/encoder/vp9_alt_ref_aq.h @@ -54,7 +54,7 @@ struct ALT_REF_AQ; * * \return Instance of the class */ -struct ALT_REF_AQ *vp9_alt_ref_aq_create(); +struct ALT_REF_AQ *vp9_alt_ref_aq_create(void); /*!\brief Upload segmentation_map to self object * diff --git a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c index 048ea629f..2f2f0055a 100644 --- a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -425,9 +425,10 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { int target_refresh = 0; double weight_segment_target = 0; double weight_segment = 0; + int thresh_low_motion = (cm->width < 720) ? 
55 : 20; cr->apply_cyclic_refresh = 1; if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 || - (!cpi->use_svc && rc->avg_frame_low_motion < 55 && + (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion && rc->frames_since_key > 40)) { cr->apply_cyclic_refresh = 0; return; diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c index 8433f4edd..d346cd57a 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libvpx/vp9/encoder/vp9_bitstream.c @@ -919,7 +919,9 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) { } } -static int encode_tile_worker(VP9_COMP *cpi, VP9BitstreamWorkerData *data) { +static int encode_tile_worker(void *arg1, void *arg2) { + VP9_COMP *cpi = (VP9_COMP *)arg1; + VP9BitstreamWorkerData *data = (VP9BitstreamWorkerData *)arg2; MACROBLOCKD *const xd = &data->xd; const int tile_row = 0; vpx_start_encode(&data->bit_writer, data->dest); @@ -995,7 +997,7 @@ static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { } worker->data1 = cpi; worker->data2 = data; - worker->hook = (VPxWorkerHook)encode_tile_worker; + worker->hook = encode_tile_worker; worker->had_error = 0; if (i < num_workers - 1) { diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h index ab488f48f..724205dd5 100644 --- a/libvpx/vp9/encoder/vp9_block.h +++ b/libvpx/vp9/encoder/vp9_block.h @@ -172,6 +172,14 @@ struct macroblock { uint8_t last_sb_high_content; + int sb_use_mv_part; + + int sb_mvcol_part; + + int sb_mvrow_part; + + int sb_pickmode_part; + // For each superblock: saves the content value (e.g., low/high sad/sumdiff) // based on source sad, prior to encoding the frame. uint8_t content_state_sb; @@ -181,11 +189,15 @@ struct macroblock { // 32x32, 9~24 for 16x16. uint8_t variance_low[25]; - void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride); - void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob); + uint8_t arf_frame_usage; + uint8_t lastgolden_frame_usage; + + void (*fwd_txfm4x4)(const int16_t *input, tran_low_t *output, int stride); + void (*inv_txfm_add)(const tran_low_t *input, uint8_t *dest, int stride, + int eob); #if CONFIG_VP9_HIGHBITDEPTH - void (*highbd_itxm_add)(const tran_low_t *input, uint16_t *dest, int stride, - int eob, int bd); + void (*highbd_inv_txfm_add)(const tran_low_t *input, uint16_t *dest, + int stride, int eob, int bd); #endif }; diff --git a/libvpx/vp9/encoder/vp9_context_tree.h b/libvpx/vp9/encoder/vp9_context_tree.h index 9e4cbb360..73423c075 100644 --- a/libvpx/vp9/encoder/vp9_context_tree.h +++ b/libvpx/vp9/encoder/vp9_context_tree.h @@ -65,6 +65,7 @@ typedef struct { int_mv best_sse_mv; MV_REFERENCE_FRAME best_reference_frame; MV_REFERENCE_FRAME best_zeromv_reference_frame; + int sb_skip_denoising; #endif // motion vector cache for adaptive motion search control in partition diff --git a/libvpx/vp9/encoder/vp9_denoiser.c b/libvpx/vp9/encoder/vp9_denoiser.c index e6933f00d..b08ccaa66 100644 --- a/libvpx/vp9/encoder/vp9_denoiser.c +++ b/libvpx/vp9/encoder/vp9_denoiser.c @@ -21,8 +21,6 @@ #include "vp9/encoder/vp9_denoiser.h" #include "vp9/encoder/vp9_encoder.h" -// OUTPUT_YUV_DENOISED - #ifdef OUTPUT_YUV_DENOISED static void make_grayscale(YV12_BUFFER_CONFIG *yuv); #endif @@ -190,11 +188,13 @@ static VP9_DENOISER_DECISION perform_motion_compensation( VP9_COMMON *const cm, VP9_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs, int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx, int motion_magnitude, int is_skin, int *zeromv_filter, int 
consec_zeromv, - int num_spatial_layers, int width) { + int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx, + int use_svc, int spatial_layer) { const int sse_diff = (ctx->newmv_sse == UINT_MAX) ? 0 : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse); - MV_REFERENCE_FRAME frame; + int frame; + int denoise_layer_idx = 0; MACROBLOCKD *filter_mbd = &mb->e_mbd; MODE_INFO *mi = filter_mbd->mi[0]; MODE_INFO saved_mi; @@ -202,8 +202,10 @@ static VP9_DENOISER_DECISION perform_motion_compensation( struct buf_2d saved_dst[MAX_MB_PLANE]; struct buf_2d saved_pre[MAX_MB_PLANE]; RefBuffer *saved_block_refs[2]; + MV_REFERENCE_FRAME saved_frame; frame = ctx->best_reference_frame; + saved_mi = *mi; if (is_skin && (motion_magnitude > 0 || consec_zeromv < 4)) return COPY_BLOCK; @@ -217,7 +219,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( // If the best reference frame uses inter-prediction and there is enough of a // difference in sum-squared-error, use it. - if (frame != INTRA_FRAME && + if (frame != INTRA_FRAME && frame != ALTREF_FRAME && (frame != GOLDEN_FRAME || num_spatial_layers == 1) && sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) { mi->ref_frame[0] = ctx->best_reference_frame; @@ -228,7 +230,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( frame = ctx->best_zeromv_reference_frame; ctx->newmv_sse = ctx->zeromv_sse; // Bias to last reference. - if (num_spatial_layers > 1 || + if (num_spatial_layers > 1 || frame == ALTREF_FRAME || (frame != LAST_FRAME && ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) || denoiser->denoising_level >= kDenHigh))) { @@ -246,6 +248,19 @@ static VP9_DENOISER_DECISION perform_motion_compensation( } } + saved_frame = frame; + // When using SVC, we need to map REF_FRAME to the frame buffer index. + if (use_svc) { + if (frame == LAST_FRAME) + frame = lst_fb_idx + 1; + else if (frame == GOLDEN_FRAME) + frame = gld_fb_idx + 1; + // Shift for the second spatial layer. 
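/*
 * [Editor's aside] The REF_FRAME -> denoiser buffer mapping applied here,
 * restated as a standalone helper (a sketch; assumes the vp9 enum values
 * LAST_FRAME == 1 and GOLDEN_FRAME == 2, and a hypothetical name):
 */
static int denoiser_buf_index(int frame, int lst_fb_idx, int gld_fb_idx,
                              int is_second_spatial_layer,
                              int num_ref_frames) {
  int idx = frame; /* INTRA_FRAME (0) keeps slot 0 */
  if (frame == 1)
    idx = lst_fb_idx + 1; /* LAST_FRAME */
  else if (frame == 2)
    idx = gld_fb_idx + 1; /* GOLDEN_FRAME */
  if (is_second_spatial_layer) idx += num_ref_frames;
  return idx;
}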
+ if (num_spatial_layers - spatial_layer == 2) + frame = frame + denoiser->num_ref_frames; + denoise_layer_idx = num_spatial_layers - spatial_layer - 1; + } + if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { // Restore everything to its original state *mi = saved_mi; @@ -279,20 +294,23 @@ static VP9_DENOISER_DECISION perform_motion_compensation( denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col); filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride; - filter_mbd->plane[0].dst.buf = - block_start(denoiser->mc_running_avg_y.y_buffer, - denoiser->mc_running_avg_y.y_stride, mi_row, mi_col); - filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride; - filter_mbd->plane[1].dst.buf = - block_start(denoiser->mc_running_avg_y.u_buffer, - denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col); - filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride; - filter_mbd->plane[2].dst.buf = - block_start(denoiser->mc_running_avg_y.v_buffer, - denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col); - filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride; - - set_ref_ptrs(cm, filter_mbd, frame, NONE); + filter_mbd->plane[0].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].y_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].y_stride, mi_row, mi_col); + filter_mbd->plane[0].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].y_stride; + filter_mbd->plane[1].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].u_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col); + filter_mbd->plane[1].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; + filter_mbd->plane[2].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].v_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col); + filter_mbd->plane[2].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; + + set_ref_ptrs(cm, filter_mbd, saved_frame, NONE); vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs); // Restore everything to its original state @@ -314,9 +332,17 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, int zeromv_filter = 0; VP9_DENOISER *denoiser = &cpi->denoiser; VP9_DENOISER_DECISION decision = COPY_BLOCK; - YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME]; - YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y; + + const int shift = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 + ? denoiser->num_ref_frames + : 0; + YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift]; + const int denoise_layer_index = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1; + YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index]; uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col); + uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col); struct buf_2d src = mb->plane[0].src; @@ -338,8 +364,8 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, VP9_COMMON *const cm = &cpi->common; int j, i; // Loop through the 8x8 sub-blocks. 
- const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; - const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; + const int bw = num_8x8_blocks_wide_lookup[bs]; + const int bh = num_8x8_blocks_high_lookup[bs]; const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); const int block_index = mi_row * cm->mi_cols + mi_col; @@ -366,14 +392,12 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, } if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1; - // TODO(marpan): There is an issue with denoising for speed 5, - // due to the partitioning scheme based on pickmode. - // Remove this speed constraint when issue is resolved. - if (denoiser->denoising_level >= kDenLow && cpi->oxcf.speed > 5) + if (denoiser->denoising_level >= kDenLow && !ctx->sb_skip_denoising) decision = perform_motion_compensation( &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, motion_magnitude, is_skin, &zeromv_filter, consec_zeromv, - cpi->svc.number_spatial_layers, cpi->Source->y_width); + cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx, + cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id); if (decision == FILTER_BLOCK) { decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start, @@ -382,12 +406,12 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, } if (decision == FILTER_BLOCK) { - vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, - NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2, + vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, 0, + 0, 0, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } else { // COPY_BLOCK - vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, - NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2, + vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, 0, + 0, 0, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } *denoiser_decision = decision; @@ -423,7 +447,9 @@ static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest, void vp9_denoiser_update_frame_info( VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, - int resized, int svc_base_is_key) { + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, + int svc_base_is_key, int second_spatial_layer) { + const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0; // Copy source into denoised reference buffers on KEY_FRAME or // if the just encoded frame was resized. For SVC, copy source if the base // spatial layer was key frame. @@ -431,8 +457,10 @@ void vp9_denoiser_update_frame_info( svc_base_is_key) { int i; // Start at 1 so as not to overwrite the INTRA_FRAME - for (i = 1; i < MAX_REF_FRAMES; ++i) - copy_frame(&denoiser->running_avg_y[i], &src); + for (i = 1; i < denoiser->num_ref_frames; ++i) { + if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL) + copy_frame(&denoiser->running_avg_y[i + shift], &src); + } denoiser->reset = 0; return; } @@ -440,29 +468,29 @@ void vp9_denoiser_update_frame_info( // If more than one refresh occurs, must copy frame buffer. 
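/*
 * [Editor's aside] Why the branch below copies instead of swapping: slot
 * INTRA_FRAME holds the freshly denoised frame; a single refresh target
 * can simply trade buffers with it, but several targets each need slot 0
 * intact. A sketch; Frame stands in for YV12_BUFFER_CONFIG:
 */
#include <string.h>

typedef struct { unsigned char *buf; size_t size; } Frame;

static void update_refs(Frame *slots, const int *refresh, int n) {
  int i, targets = 0;
  for (i = 1; i < n; ++i) targets += refresh[i];
  for (i = 1; i < n; ++i) {
    if (!refresh[i]) continue;
    if (targets > 1) {
      memcpy(slots[i].buf, slots[0].buf, slots[0].size); /* slot 0 reused */
    } else {
      Frame t = slots[i]; /* single target: O(1) buffer swap */
      slots[i] = slots[0];
      slots[0] = t;
    }
  }
}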
if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) { if (refresh_alt_ref_frame) { - copy_frame(&denoiser->running_avg_y[ALTREF_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_golden_frame) { - copy_frame(&denoiser->running_avg_y[GOLDEN_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_last_frame) { - copy_frame(&denoiser->running_avg_y[LAST_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } } else { if (refresh_alt_ref_frame) { - swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_golden_frame) { - swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_last_frame) { - swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } } } @@ -491,19 +519,110 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse, } } -int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, - int ssy, +static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm, + VP9_DENOISER *denoiser, int fb_idx) { + int fail = 0; + if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) { + fail = + vpx_alloc_frame_buffer(&denoiser->running_avg_y[fb_idx], cm->width, + cm->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, 0); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } + } + return 0; +} + +int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, + int svc_buf_shift, int refresh_alt, + int refresh_gld, int refresh_lst, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx) { + int fail = 0; + if (refresh_alt) { + // Increase the frame buffer index by 1 to map it to the buffer index in the + // denoiser. 
+ fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + alt_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_gld) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + gld_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_lst) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + lst_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + return 0; +} + +int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, #if CONFIG_VP9_HIGHBITDEPTH int use_highbitdepth, #endif int border) { - int i, fail; + int i, layer, fail, init_num_ref_frames; const int legacy_byte_alignment = 0; + int num_layers = 1; + int scaled_width = width; + int scaled_height = height; + if (use_svc) { + LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id * + svc->number_temporal_layers + + svc->temporal_layer_id]; + get_layer_resolution(width, height, lc->scaling_factor_num, + lc->scaling_factor_den, &scaled_width, &scaled_height); + // For SVC: only denoise at most 2 spatial (highest) layers. + if (noise_sen >= 2) + // Denoise from one spatial layer below the top. + svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 2, 0); + else + // Only denoise the top spatial layer. + svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 1, 0); + num_layers = svc->number_spatial_layers - svc->first_layer_denoise; + } assert(denoiser != NULL); + denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES; + init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES; + denoiser->num_layers = num_layers; + CHECK_MEM_ERROR(cm, denoiser->running_avg_y, + vpx_calloc(denoiser->num_ref_frames * num_layers, + sizeof(denoiser->running_avg_y[0]))); + CHECK_MEM_ERROR( + cm, denoiser->mc_running_avg_y, + vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0]))); + + for (layer = 0; layer < num_layers; ++layer) { + const int denoise_width = (layer == 0) ? width : scaled_width; + const int denoise_height = (layer == 0) ? height : scaled_height; + for (i = 0; i < init_num_ref_frames; ++i) { + fail = vpx_alloc_frame_buffer( + &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer], + denoise_width, denoise_height, ssx, ssy, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border, legacy_byte_alignment); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + } - for (i = 0; i < MAX_REF_FRAMES; ++i) { - fail = vpx_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height, - ssx, ssy, + fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y[layer], + denoise_width, denoise_height, ssx, ssy, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, #endif @@ -512,22 +631,10 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, vp9_denoiser_free(denoiser); return 1; } -#ifdef OUTPUT_YUV_DENOISED - make_grayscale(&denoiser->running_avg_y[i]); -#endif - } - - fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height, ssx, - ssy, -#if CONFIG_VP9_HIGHBITDEPTH - use_highbitdepth, -#endif - border, legacy_byte_alignment); - if (fail) { - vp9_denoiser_free(denoiser); - return 1; } + // denoiser->last_source only used for noise_estimation, so only for top + // layer. 
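/*
 * [Editor's aside] The SVC layer selection in vp9_denoiser_alloc above,
 * restated: noise sensitivity >= 2 denoises the top two spatial layers,
 * otherwise only the top one (hypothetical helper name):
 */
static void denoise_layer_span(int num_spatial_layers, int noise_sen,
                               int *first_layer, int *num_layers) {
  const int span = (noise_sen >= 2) ? 2 : 1;
  *first_layer = num_spatial_layers - span;
  if (*first_layer < 0) *first_layer = 0; /* VPXMAX(..., 0) in the source */
  *num_layers = num_spatial_layers - *first_layer;
}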
fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, @@ -553,10 +660,18 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) { return; } denoiser->frame_buffer_initialized = 0; - for (i = 0; i < MAX_REF_FRAMES; ++i) { + for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) { vpx_free_frame_buffer(&denoiser->running_avg_y[i]); } - vpx_free_frame_buffer(&denoiser->mc_running_avg_y); + vpx_free(denoiser->running_avg_y); + denoiser->running_avg_y = NULL; + + for (i = 0; i < denoiser->num_layers; ++i) { + vpx_free_frame_buffer(&denoiser->mc_running_avg_y[i]); + } + + vpx_free(denoiser->mc_running_avg_y); + denoiser->mc_running_avg_y = NULL; vpx_free_frame_buffer(&denoiser->last_source); } @@ -570,7 +685,8 @@ void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level) { denoiser->prev_denoising_level = denoiser->denoising_level; } -// Scale/increase the partition threshold for denoiser speed-up. +// Scale/increase the partition threshold +// for denoiser speed-up. int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, int content_state, int temporal_layer_id) { if ((content_state == kLowSadLowSumdiff) || @@ -585,7 +701,8 @@ int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, } } -// Scale/increase the ac skip threshold for denoiser speed-up. +// Scale/increase the ac skip threshold for +// denoiser speed-up. int64_t vp9_scale_acskip_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, int temporal_layer_id) { diff --git a/libvpx/vp9/encoder/vp9_denoiser.h b/libvpx/vp9/encoder/vp9_denoiser.h index f0845e113..f4da24cbf 100644 --- a/libvpx/vp9/encoder/vp9_denoiser.h +++ b/libvpx/vp9/encoder/vp9_denoiser.h @@ -21,6 +21,14 @@ extern "C" { #define MOTION_MAGNITUDE_THRESHOLD (8 * 3) +// Denoiser is used in non svc real-time mode which does not use alt-ref, so no +// need to allocate for it, and hence we need MAX_REF_FRAME - 1 +#define NONSVC_REF_FRAMES MAX_REF_FRAMES - 1 + +// Number of frame buffers when SVC is used. 
[0] for current denoised buffer and +// [1..8] for REF_FRAMES +#define SVC_REF_FRAMES 9 + typedef enum vp9_denoiser_decision { COPY_BLOCK, FILTER_BLOCK, @@ -35,11 +43,13 @@ typedef enum vp9_denoiser_level { } VP9_DENOISER_LEVEL; typedef struct vp9_denoiser { - YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES]; - YV12_BUFFER_CONFIG mc_running_avg_y; + YV12_BUFFER_CONFIG *running_avg_y; + YV12_BUFFER_CONFIG *mc_running_avg_y; YV12_BUFFER_CONFIG last_source; int frame_buffer_initialized; int reset; + int num_ref_frames; + int num_layers; VP9_DENOISER_LEVEL denoising_level; VP9_DENOISER_LEVEL prev_denoising_level; } VP9_DENOISER; @@ -57,11 +67,13 @@ typedef struct { } VP9_PICKMODE_CTX_DEN; struct VP9_COMP; +struct SVC; void vp9_denoiser_update_frame_info( VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, - int resized, int svc_base_is_key); + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, + int svc_base_is_key, int second_spatial_layer); void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, @@ -73,8 +85,14 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse, PREDICTION_MODE mode, PICK_MODE_CONTEXT *ctx); -int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, - int ssy, +int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, + int svc_buf_shift, int refresh_alt, + int refresh_gld, int refresh_lst, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx); + +int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, #if CONFIG_VP9_HIGHBITDEPTH int use_highbitdepth, #endif diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c index 6215e198c..682477df1 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libvpx/vp9/encoder/vp9_encodeframe.c @@ -125,19 +125,17 @@ static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = { }; #endif // CONFIG_VP9_HIGHBITDEPTH -unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi, - const struct buf_2d *ref, - BLOCK_SIZE bs) { +unsigned int vp9_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref, + BLOCK_SIZE bs) { unsigned int sse; const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, VP9_VAR_OFFS, 0, &sse); - return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); + return var; } #if CONFIG_VP9_HIGHBITDEPTH -unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, - const struct buf_2d *ref, - BLOCK_SIZE bs, int bd) { +unsigned int vp9_high_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { unsigned int var, sse; switch (bd) { case 10: @@ -157,8 +155,24 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8), 0, &sse); break; } - return (unsigned int)ROUND64_POWER_OF_TWO((int64_t)var, - num_pels_log2_lookup[bs]); + return var; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs) { + return ROUND_POWER_OF_TWO(vp9_get_sby_variance(cpi, ref, bs), + num_pels_log2_lookup[bs]); +} + +#if CONFIG_VP9_HIGHBITDEPTH +unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { + return (unsigned int)ROUND64_POWER_OF_TWO( + (int64_t)vp9_high_get_sby_variance(cpi, 
ref, bs, bd), + num_pels_log2_lookup[bs]); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -287,8 +301,12 @@ static void set_block_size(VP9_COMP *const cpi, MACROBLOCK *const x, } typedef struct { - int64_t sum_square_error; - int64_t sum_error; + // This struct is used for computing variance in choose_partitioning(), where + // the max number of samples within a superblock is 16x16 (with 4x4 avg). Even + // in high bitdepth, uint32_t is enough for sum_square_error (2^12 * 2^12 * 16 + // * 16 = 2^32). + uint32_t sum_square_error; + int32_t sum_error; int log2_count; int variance; } var; @@ -381,7 +399,7 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { } // Set variance values given sum square error, sum error, count. -static void fill_variance(int64_t s2, int64_t s, int c, var *v) { +static void fill_variance(uint32_t s2, int32_t s, int c, var *v) { v->sum_square_error = s2; v->sum_error = s; v->log2_count = c; @@ -489,8 +507,9 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x, return 0; } -int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, int width, - int height, int content_state) { +static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, + int width, int height, + int content_state) { if (speed >= 8) { if (width <= 640 && height <= 480) return (5 * threshold_base) >> 2; @@ -554,6 +573,8 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q, #endif thresholds[0] = threshold_base; thresholds[2] = threshold_base << cpi->oxcf.speed; + if (cm->width >= 1280 && cm->height >= 720 && cpi->oxcf.speed < 7) + thresholds[2] = thresholds[2] << 1; if (cm->width <= 352 && cm->height <= 288) { thresholds[0] = threshold_base >> 3; thresholds[1] = threshold_base >> 1; @@ -742,16 +763,7 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, for (i = 0; i < ymis; i += 2) { for (j = 0; j < xmis; j += 2) { int bl_index = block_index + i * cm->mi_cols + j; - int bl_index1 = bl_index + 1; - int bl_index2 = bl_index + cm->mi_cols; - int bl_index3 = bl_index2 + 1; - int consec_zeromv = - VPXMIN(cpi->consec_zero_mv[bl_index], - VPXMIN(cpi->consec_zero_mv[bl_index1], - VPXMIN(cpi->consec_zero_mv[bl_index2], - cpi->consec_zero_mv[bl_index3]))); - int is_skin = vp9_compute_skin_block( - ysignal, usignal, vsignal, sp, spuv, BLOCK_16X16, consec_zeromv, 0); + int is_skin = cpi->skin_map[bl_index]; num_16x16_skin += is_skin; num_16x16_nonskin += (1 - is_skin); if (num_16x16_nonskin > 3) { @@ -849,7 +861,7 @@ static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x, int start_pos = mi_row * cm->mi_stride + mi_col; const int bsl = b_width_log2_lookup[bsize]; - const int bs = (1 << bsl) / 4; + const int bs = (1 << bsl) >> 2; BLOCK_SIZE subsize; PARTITION_TYPE partition; @@ -895,10 +907,7 @@ static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers); const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; - if (lc->is_key_frame || - (cpi->svc.temporal_layer_id != cpi->svc.number_temporal_layers - 1 && - cpi->svc.number_temporal_layers > 1)) - svc_copy_allowed = 0; + if (lc->is_key_frame || !cpi->svc.non_reference_frame) svc_copy_allowed = 0; frames_since_key_thresh = cpi->svc.number_spatial_layers << 1; } if (cpi->rc.frames_since_key > frames_since_key_thresh && svc_copy_allowed && @@ -917,13 +926,165 @@ static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, 
return 0; } -static void update_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, - int mi_col) { +static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int mi_row_high, int mi_col_high) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + BLOCK_SIZE *prev_part = svc->prev_partition_svc; + // Variables with _high are for higher resolution. + int bsize_high = 0; + int subsize_high = 0; + const int bsl_high = b_width_log2_lookup[bsize]; + const int bs_high = (1 << bsl_high) >> 2; + const int has_rows = (mi_row_high + bs_high) < cm->mi_rows; + const int has_cols = (mi_col_high + bs_high) < cm->mi_cols; + + const int row_boundary_block_scale_factor[BLOCK_SIZES] = { + 13, 13, 13, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0 + }; + const int col_boundary_block_scale_factor[BLOCK_SIZES] = { + 13, 13, 13, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0 + }; + int start_pos; + BLOCK_SIZE bsize_low; + PARTITION_TYPE partition_high; + + if (mi_row_high >= cm->mi_rows || mi_col_high >= cm->mi_cols) return 0; + if (mi_row >= (cm->mi_rows >> 1) || mi_col >= (cm->mi_cols >> 1)) return 0; + + // Find corresponding (mi_col/mi_row) block down-scaled by 2x2. + start_pos = mi_row * (svc->mi_stride[svc->spatial_layer_id - 1]) + mi_col; + bsize_low = prev_part[start_pos]; + // The block size is too big for boundaries. Do variance based partitioning. + if ((!has_rows || !has_cols) && bsize_low > BLOCK_16X16) return 1; + + // For reference frames: return 1 (do variance-based partitioning) if the + // superblock is not low source sad and lower-resoln bsize is below 32x32. + if (!cpi->svc.non_reference_frame && !x->skip_low_source_sad && + bsize_low < BLOCK_32X32) + return 1; + + // Scale up block size by 2x2. Force 64x64 for size larger than 32x32. + if (bsize_low < BLOCK_32X32) { + bsize_high = bsize_low + 3; + } else if (bsize_low >= BLOCK_32X32) { + bsize_high = BLOCK_64X64; + } + // Scale up blocks on boundary. 
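
The bsize_low + 3 step above leans on the BLOCK_SIZES ordering, in which the square sizes sit three entries apart, so adding 3 doubles both block dimensions; the boundary tables that follow then tweak that step for partial superblocks. A small sketch, assuming the usual upstream enum values:

#include <stdio.h>

/* Assumed enum values: square sizes are 3 apart in vp9's BLOCK_SIZES. */
enum { BLOCK_8X8 = 3, BLOCK_16X16 = 6, BLOCK_32X32 = 9, BLOCK_64X64 = 12 };

int main(void) {
  int bsize_low;
  for (bsize_low = BLOCK_8X8; bsize_low <= BLOCK_32X32; bsize_low += 3) {
    int bsize_high = (bsize_low < BLOCK_32X32) ? bsize_low + 3 : BLOCK_64X64;
    printf("%d -> %d\n", bsize_low, bsize_high); /* a 2x2 upscale per step */
  }
  return 0;
}
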
+ if (!has_cols && has_rows) { + bsize_high = bsize_low + row_boundary_block_scale_factor[bsize_low]; + } else if (has_cols && !has_rows) { + bsize_high = bsize_low + col_boundary_block_scale_factor[bsize_low]; + } else if (!has_cols && !has_rows) { + bsize_high = bsize_low; + } + + partition_high = partition_lookup[bsl_high][bsize_high]; + subsize_high = get_subsize(bsize, partition_high); + + if (subsize_high < BLOCK_8X8) { + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high); + } else { + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + switch (partition_high) { + case PARTITION_NONE: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high); + break; + case PARTITION_HORZ: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high); + if (subsize_high < BLOCK_64X64) + set_block_size(cpi, x, xd, mi_row_high + bs_high, mi_col_high, + subsize_high); + break; + case PARTITION_VERT: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high); + if (subsize_high < BLOCK_64X64) + set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs_high, + subsize_high); + break; + case PARTITION_SPLIT: + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col, + mi_row_high, mi_col_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1), + mi_col, mi_row_high + bs_high, mi_col_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, + mi_col + (bs >> 1), mi_row_high, + mi_col_high + bs_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1), + mi_col + (bs >> 1), mi_row_high + bs_high, + mi_col_high + bs_high)) + return 1; + break; + default: assert(0); + } + } + + return 0; +} + +static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = cpi->svc.prev_partition_svc; + int start_pos = mi_row * cm->mi_stride + mi_col; + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + const MODE_INFO *mi = NULL; + int xx, yy; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + mi = cm->mi_grid_visible[start_pos]; + partition = partition_lookup[bsl][mi->sb_type]; + subsize = get_subsize(bsize, partition); + if (subsize < BLOCK_8X8) { + prev_part[start_pos] = bsize; + } else { + switch (partition) { + case PARTITION_NONE: + prev_part[start_pos] = bsize; + if (bsize == BLOCK_64X64) { + for (xx = 0; xx < 8; xx += 4) + for (yy = 0; yy < 8; yy += 4) { + if ((mi_row + xx < cm->mi_rows) && (mi_col + yy < cm->mi_cols)) + prev_part[start_pos + xx * cm->mi_stride + yy] = bsize; + } + } + break; + case PARTITION_HORZ: + prev_part[start_pos] = subsize; + if (mi_row + bs < cm->mi_rows) + prev_part[start_pos + bs * cm->mi_stride] = subsize; + break; + case PARTITION_VERT: + prev_part[start_pos] = subsize; + if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; + break; + case PARTITION_SPLIT: + update_partition_svc(cpi, subsize, mi_row, mi_col); + update_partition_svc(cpi, subsize, mi_row + bs, mi_col); + update_partition_svc(cpi, subsize, mi_row, mi_col + bs); + update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs); + break; + default: assert(0); + } + } +} + +static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; BLOCK_SIZE *prev_part = cpi->prev_partition; 
int start_pos = mi_row * cm->mi_stride + mi_col; const int bsl = b_width_log2_lookup[bsize]; - const int bs = (1 << bsl) / 4; + const int bs = (1 << bsl) >> 2; BLOCK_SIZE subsize; PARTITION_TYPE partition; const MODE_INFO *mi = NULL; @@ -948,16 +1109,26 @@ static void update_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; break; case PARTITION_SPLIT: - update_prev_partition(cpi, subsize, mi_row, mi_col); - update_prev_partition(cpi, subsize, mi_row + bs, mi_col); - update_prev_partition(cpi, subsize, mi_row, mi_col + bs); - update_prev_partition(cpi, subsize, mi_row + bs, mi_col + bs); + update_prev_partition_helper(cpi, subsize, mi_row, mi_col); + update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col); + update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs); + update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs); break; default: assert(0); } } } +static void update_prev_partition(VP9_COMP *cpi, MACROBLOCK *x, int segment_id, + int mi_row, int mi_col, int sb_offset) { + update_prev_partition_helper(cpi, BLOCK_64X64, mi_row, mi_col); + cpi->prev_segment_id[sb_offset] = segment_id; + memcpy(&(cpi->prev_variance_low[sb_offset * 25]), x->variance_low, + sizeof(x->variance_low)); + // Reset the counter for copy partitioning + cpi->copied_frame_cnt[sb_offset] = 0; +} + static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, unsigned int y_sad, int is_key_frame) { int i; @@ -989,8 +1160,8 @@ static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, } } -static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, - int sb_offset) { +static uint64_t avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, + int sb_offset) { unsigned int tmp_sse; uint64_t tmp_sad; unsigned int tmp_variance; @@ -1002,7 +1173,7 @@ static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, uint64_t avg_source_sad_threshold = 10000; uint64_t avg_source_sad_threshold2 = 12000; #if CONFIG_VP9_HIGHBITDEPTH - if (cpi->common.use_highbitdepth) return; + if (cpi->common.use_highbitdepth) return 0; #endif src_y += shift; last_src_y += shift; @@ -1019,8 +1190,12 @@ static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, : kHighSadHighSumdiff; // Detect large lighting change. - if (tmp_variance < (tmp_sse >> 3) && (tmp_sse - tmp_variance) > 10000) + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && + cpi->oxcf.rc_mode == VPX_CBR && tmp_variance < (tmp_sse >> 3) && + (tmp_sse - tmp_variance) > 10000) x->content_state_sb = kLowVarHighSumdiff; + else if (tmp_sad > (avg_source_sad_threshold << 1)) + x->content_state_sb = kVeryHighSad; if (cpi->content_state_sb_fd != NULL) { if (tmp_sad < avg_source_sad_threshold2) { @@ -1031,7 +1206,7 @@ static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, cpi->content_state_sb_fd[sb_offset] = 0; } } - return; + return tmp_sad; } // This function chooses partitioning based on the variance between source and @@ -1042,7 +1217,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, MACROBLOCKD *xd = &x->e_mbd; int i, j, k, m; v64x64 vt; - v16x16 vt2[16]; + v16x16 *vt2 = NULL; int force_split[21]; int avg_32x32; int max_var_32x32 = 0; @@ -1058,6 +1233,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, const uint8_t *d; int sp; int dp; + int compute_minmax_variance = 1; unsigned int y_sad = UINT_MAX; BLOCK_SIZE bsize = BLOCK_64X64; // Ref frame used in partitioning. 
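
On the lighting-change test above (now limited to CBR, non-screen content): tmp_sse is the raw sum of squared frame differences while tmp_variance has the mean removed, so a large gap between the two means the difference is mostly a uniform brightness offset rather than motion. A numeric sketch with hypothetical values:

#include <stdio.h>

int main(void) {
  unsigned int tmp_sse = 160000;     /* hypothetical SB diff energy */
  unsigned int tmp_variance = 12000; /* same diff with mean removed */
  if (tmp_variance < (tmp_sse >> 3) && (tmp_sse - tmp_variance) > 10000)
    printf("kLowVarHighSumdiff: likely lighting change\n");
  return 0;
}
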
@@ -1082,6 +1258,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); segment_id = xd->mi[0]->segment_id; + if (cpi->oxcf.speed >= 8 || (cpi->use_svc && cpi->svc.non_reference_frame)) + compute_minmax_variance = 0; + + memset(x->variance_low, 0, sizeof(x->variance_low)); + if (cpi->sf.use_source_sad && !is_key_frame) { int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); content_state = x->content_state_sb; @@ -1092,9 +1273,27 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, x->lowvar_highsumdiff = (content_state == kLowVarHighSumdiff) ? 1 : 0; if (cpi->content_state_sb_fd != NULL) x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2]; + + // For SVC on top spatial layer: use/scale the partition from + // the lower spatial resolution if svc_use_lowres_part is enabled. + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->svc.prev_partition_svc != NULL && content_state != kVeryHighSad) { + if (!scale_partitioning_svc(cpi, x, xd, BLOCK_64X64, mi_row >> 1, + mi_col >> 1, mi_row, mi_col)) { + if (cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } + return 0; + } + } // If source_sad is low copy the partition without computing the y_sad. if (x->skip_low_source_sad && cpi->sf.copy_partition_flag && copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { + x->sb_use_mv_part = 1; + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); return 0; } } @@ -1110,8 +1309,6 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // For non keyframes, disable 4x4 average for low resolution when speed = 8 threshold_4x4avg = (cpi->oxcf.speed < 8) ? 
thresholds[1] << 1 : INT64_MAX; - memset(x->variance_low, 0, sizeof(x->variance_low)); - if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); @@ -1171,12 +1368,17 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, mi->mv[0].as_int = 0; mi->interp_filter = BILINEAR; - if (cpi->oxcf.speed >= 8 && !low_res) + if (cpi->oxcf.speed >= 8 && !low_res && + x->content_state_sb != kVeryHighSad) { y_sad = cpi->fn_ptr[bsize].sdf( x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); - else + } else { y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + x->sb_use_mv_part = 1; + x->sb_mvcol_part = mi->mv[0].as_mv.col; + x->sb_mvrow_part = mi->mv[0].as_mv.row; + } y_sad_last = y_sad; // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad @@ -1197,7 +1399,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); - x->sb_is_skin = skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split); + if (cpi->use_skin_detection) + x->sb_is_skin = + skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split); d = xd->plane[0].dst.buf; dp = xd->plane[0].dst.stride; @@ -1212,6 +1416,12 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64); x->variance_low[0] = 1; chroma_check(cpi, x, bsize, y_sad, is_key_frame); + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + if (cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } return 0; } } @@ -1223,6 +1433,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->sf.copy_partition_flag && y_sad_last < cpi->vbp_threshold_copy && copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { chroma_check(cpi, x, bsize, y_sad, is_key_frame); + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); return 0; } } else { @@ -1240,6 +1453,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, #endif // CONFIG_VP9_HIGHBITDEPTH } + if (low_res && threshold_4x4avg < INT64_MAX) + CHECK_MEM_ERROR(cm, vt2, vpx_calloc(16, sizeof(*vt2))); // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances // for splits. 
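
Before the tree fill below, recall the identity the var nodes rely on: with N = 1 << log2_count samples, the sum of squared deviations is s2 - s*s/N, which is why uint32_t now suffices for sum_square_error (see the struct comment earlier in this diff). A worked sketch with hypothetical sums:

#include <stdio.h>
#include <stdint.h>

int main(void) {
  uint32_t s2 = 5000; /* sum of squared errors */
  int32_t s = 80;     /* sum of errors */
  int log2_count = 6; /* 64 samples, e.g. one 8x8 node */
  uint32_t dev = s2 - (uint32_t)(((int64_t)s * s) >> log2_count);
  printf("squared deviations = %u, per-sample variance ~ %u\n", dev,
         dev >> log2_count); /* 4900 and 76 */
  return 0;
}
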
for (i = 0; i < 4; i++) { @@ -1276,7 +1491,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, force_split[split_index] = 1; force_split[i + 1] = 1; force_split[0] = 1; - } else if (cpi->oxcf.speed < 8 && + } else if (compute_minmax_variance && vt.split[i].split[j].part_variances.none.variance > thresholds[1] && !cyclic_refresh_segment_id_boosted(segment_id)) { @@ -1288,7 +1503,10 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, xd->cur_buf->flags, #endif pixels_wide, pixels_high); - if (minmax > cpi->vbp_threshold_minmax) { + int thresh_minmax = (int)cpi->vbp_threshold_minmax; + if (x->content_state_sb == kVeryHighSad) + thresh_minmax = thresh_minmax << 1; + if (minmax > thresh_minmax) { force_split[split_index] = 1; force_split[i + 1] = 1; force_split[0] = 1; @@ -1431,21 +1649,20 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } if (cm->frame_type != KEY_FRAME && cpi->sf.copy_partition_flag) { - update_prev_partition(cpi, BLOCK_64X64, mi_row, mi_col); - cpi->prev_segment_id[sb_offset] = segment_id; - memcpy(&(cpi->prev_variance_low[sb_offset * 25]), x->variance_low, - sizeof(x->variance_low)); - // Reset the counter for copy partitioning - if (cpi->copied_frame_cnt[sb_offset] == cpi->max_copied_frame) - cpi->copied_frame_cnt[sb_offset] = 0; + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); } + if (cm->frame_type != KEY_FRAME && cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + if (cpi->sf.short_circuit_low_temp_var) { set_low_temp_var_flag(cpi, x, xd, &vt, thresholds, ref_frame_partition, mi_col, mi_row); } chroma_check(cpi, x, bsize, y_sad, is_key_frame); + if (vt2) vpx_free(vt2); return 0; } @@ -3480,7 +3697,7 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) { static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { - if (bsize < BLOCK_16X16) + if (!cpi->sf.nonrd_keyframe && bsize < BLOCK_16X16) vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); else vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); @@ -3644,6 +3861,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; (void)*tp_orig; + // Avoid checking for rectangular partitions for speed >= 6. + if (cpi->oxcf.speed >= 6) do_rect = 0; + assert(num_8x8_blocks_wide_lookup[bsize] == num_8x8_blocks_high_lookup[bsize]); @@ -3871,6 +4091,8 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, PARTITION_TYPE partition; BLOCK_SIZE subsize; RD_COST this_rdc; + BLOCK_SIZE subsize_ref = + (cpi->sf.adapt_partition_source_sad) ? 
BLOCK_8X8 : BLOCK_16X16; vp9_rd_cost_reset(&this_rdc); if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -3884,7 +4106,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost, 0, INT64_MAX, pc_tree); } else if (bsize == BLOCK_32X32 && partition != PARTITION_NONE && - subsize >= BLOCK_16X16) { + subsize >= subsize_ref) { x->max_partition_size = BLOCK_32X32; x->min_partition_size = BLOCK_8X8; nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost, @@ -4132,6 +4354,10 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile); + if (cpi->use_skin_detection) { + vp9_compute_skin_sb(cpi, BLOCK_16X16, mi_row, mi_col); + } + x->source_variance = UINT_MAX; vp9_zero(x->pred_mv); vp9_rd_cost_init(&dummy_rdc); @@ -4141,6 +4367,12 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, x->skip_low_source_sad = 0; x->lowvar_highsumdiff = 0; x->content_state_sb = 0; + x->sb_use_mv_part = 0; + x->sb_mvcol_part = 0; + x->sb_mvrow_part = 0; + x->sb_pickmode_part = 0; + x->arf_frame_usage = 0; + x->lastgolden_frame_usage = 0; if (seg->enabled) { const uint8_t *const map = @@ -4155,7 +4387,12 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) { int shift = cpi->Source->y_stride * (mi_row << 3) + (mi_col << 3); int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); - avg_source_sad(cpi, x, shift, sb_offset2); + int64_t source_sad = avg_source_sad(cpi, x, shift, sb_offset2); + if (sf->adapt_partition_source_sad && + (cpi->oxcf.rc_mode == VPX_VBR && !cpi->rc.is_src_frame_alt_ref && + source_sad > sf->adapt_partition_thresh && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) + partition_search_type = REFERENCE_PARTITION; } // Set the partition type of the 64X64 block @@ -4181,12 +4418,14 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); break; case REFERENCE_PARTITION: + x->sb_pickmode_part = 1; set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); // Use nonrd_pick_partition on scene-cut for VBR mode. // nonrd_pick_partition does not support 4x4 partition, so avoid it // on key frame for now. if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad && - cm->frame_type != KEY_FRAME)) { + cpi->oxcf.speed < 6 && cm->frame_type != KEY_FRAME && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { // Use lower max_partition_size for low resoultions. if (cm->width <= 352 && cm->height <= 288) x->max_partition_size = BLOCK_32X32; @@ -4213,12 +4452,34 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, default: assert(0); break; } + // Update ref_frame usage for inter frame if this group is ARF group. 
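
The sboffset computed just below indexes one counter per 64x64 superblock: mi units are 8x8 pixels, so >> 3 converts mi coordinates to SB coordinates, and (mi_cols + 7) >> 3 is the rounded-up SB stride. A sketch with a hypothetical 1280x720 frame:

#include <stdio.h>

int main(void) {
  int mi_cols = 160;                  /* 1280 / 8 */
  int mi_row = 24, mi_col = 40;       /* some superblock origin */
  int sb_stride = (mi_cols + 7) >> 3; /* 20 SBs per row */
  int sboffset = sb_stride * (mi_row >> 3) + (mi_col >> 3);
  printf("sboffset = %d\n", sboffset); /* 20 * 3 + 5 = 65 */
  return 0;
}
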
+ if (!cpi->rc.is_src_frame_alt_ref && !cpi->refresh_golden_frame && + !cpi->refresh_alt_ref_frame && cpi->rc.alt_ref_gf_group && + cpi->sf.use_altref_onepass) { + int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + if (cpi->count_arf_frame_usage != NULL) + cpi->count_arf_frame_usage[sboffset] = x->arf_frame_usage; + if (cpi->count_lastgolden_frame_usage != NULL) + cpi->count_lastgolden_frame_usage[sboffset] = x->lastgolden_frame_usage; + } + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile, num_sb_cols); } } // end RTC play code +static INLINE uint32_t variance(const diff *const d) { + return d->sse - (uint32_t)(((int64_t)d->sum * d->sum) >> 8); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE uint32_t variance_highbd(diff *const d) { + const int64_t var = (int64_t)d->sse - (((int64_t)d->sum * d->sum) >> 8); + return (var >= 0) ? (uint32_t)var : 0; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + static int set_var_thresh_from_histogram(VP9_COMP *cpi) { const SPEED_FEATURES *const sf = &cpi->sf; const VP9_COMMON *const cm = &cpi->common; @@ -4248,14 +4509,17 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { case VPX_BITS_8: vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance(var16); break; case VPX_BITS_10: vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance_highbd(var16); break; case VPX_BITS_12: vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance_highbd(var16); break; default: assert(0 && @@ -4266,12 +4530,13 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { } else { vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance(var16); } #else vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance(var16); #endif // CONFIG_VP9_HIGHBITDEPTH - var16->var = var16->sse - (((uint32_t)var16->sum * var16->sum) >> 8); if (var16->var >= VAR_HIST_MAX_BG_VAR) hist[VAR_HIST_BINS - 1]++; @@ -4482,15 +4747,15 @@ static void encode_frame_internal(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; + x->fwd_txfm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; else - x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; - x->highbd_itxm_add = + x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; + x->highbd_inv_txfm_add = xd->lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add; #else - x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; + x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH - x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + x->inv_txfm_add = xd->lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; if (xd->lossless) x->optimize = 0; @@ -4733,8 +4998,31 @@ void vp9_encode_frame(VP9_COMP *cpi) { } } } else { + FRAME_COUNTS *counts = cpi->td.counts; cm->reference_mode = SINGLE_REFERENCE; + if (cpi->allow_comp_inter_inter && cpi->sf.use_compound_nonrd_pickmode && + cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref && + cm->frame_type != KEY_FRAME) + cm->reference_mode = REFERENCE_MODE_SELECT; + encode_frame_internal(cpi); + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + int single_count_zero = 0; + int comp_count_zero = 0; + int i; + for (i = 0; i < COMP_INTER_CONTEXTS; i++) { + single_count_zero += counts->comp_inter[i][0]; + comp_count_zero += counts->comp_inter[i][1]; + } + if (comp_count_zero == 0) { + cm->reference_mode = SINGLE_REFERENCE; + vp9_zero(counts->comp_inter); + } else if (single_count_zero == 0) { + cm->reference_mode = COMPOUND_REFERENCE; + vp9_zero(counts->comp_inter); + } + } } // If segmented AQ is enabled compute the average AQ weighting. diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c index 7e30499c5..f3c17f255 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libvpx/vp9/encoder/vp9_encodemb.c @@ -49,283 +49,258 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { pd->dst.buf, pd->dst.stride); } -typedef struct vp9_token_state { - int64_t error; - int rate; - int16_t next; - int16_t token; - tran_low_t qc; - tran_low_t dqc; - uint8_t best_index; -} vp9_token_state; - static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { { 10, 6 }, { 8, 5 }, }; -#define UPDATE_RD_COST() \ - { \ - rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \ - rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \ - } - -// This function is a place holder for now but may ultimately need -// to scan previous tokens to work out the correct context. -static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb, - int idx, int token, uint8_t *token_cache) { - int bak = token_cache[scan[idx]], pt; - token_cache[scan[idx]] = vp9_pt_energy_class[token]; - pt = get_coef_context(nb, token_cache, idx + 1); - token_cache[scan[idx]] = bak; - return pt; -} +// 'num' can be negative, but 'shift' must be non-negative. +#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \ + ((num) >= 0) ? 
(num) >> (shift) : -((-(num)) >> (shift)) -static const int16_t band_count_table[TX_SIZES][8] = { - { 1, 2, 3, 4, 3, 16 - 13, 0 }, - { 1, 2, 3, 4, 11, 64 - 21, 0 }, - { 1, 2, 3, 4, 11, 256 - 21, 0 }, - { 1, 2, 3, 4, 11, 1024 - 21, 0 }, -}; -static const int16_t band_cum_count_table[TX_SIZES][8] = { - { 0, 1, 3, 6, 10, 13, 16, 0 }, - { 0, 1, 3, 6, 10, 21, 64, 0 }, - { 0, 1, 3, 6, 10, 21, 256, 0 }, - { 0, 1, 3, 6, 10, 21, 1024, 0 }, -}; int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int ctx) { MACROBLOCKD *const xd = &mb->e_mbd; struct macroblock_plane *const p = &mb->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; const int ref = is_inter_block(xd->mi[0]); - vp9_token_state tokens[1025][2]; uint8_t token_cache[1024]; - const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block); + const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const int eob = p->eobs[block]; - const PLANE_TYPE type = get_plane_type(plane); + const PLANE_TYPE plane_type = get_plane_type(plane); const int default_eob = 16 << (tx_size << 1); const int shift = (tx_size == TX_32X32); const int16_t *const dequant_ptr = pd->dequant; const uint8_t *const band_translate = get_band_translate(tx_size); - const scan_order *const so = get_scan(xd, tx_size, type, block); + const scan_order *const so = get_scan(xd, tx_size, plane_type, block); const int16_t *const scan = so->scan; const int16_t *const nb = so->neighbors; - const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift }; - int next = eob, sz = 0; - const int64_t rdmult = ((int64_t)mb->rdmult * plane_rd_mult[ref][type]) >> 1; + const int64_t rdmult = + ((int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1; const int64_t rddiv = mb->rddiv; int64_t rd_cost0, rd_cost1; - int rate0, rate1; - int64_t error0, error1; + int64_t rate0, rate1; int16_t t0, t1; - int best, band = (eob < default_eob) ? band_translate[eob] - : band_translate[eob - 1]; - int pt, i, final_eob; + int i, final_eob; #if CONFIG_VP9_HIGHBITDEPTH const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); #else const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8); #endif - unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = - mb->token_costs[tx_size][type][ref]; - const int16_t *band_counts = &band_count_table[tx_size][band]; - int16_t band_left = eob - band_cum_count_table[tx_size][band] + 1; - - token_costs += band; - - assert((!type && !plane) || (type && plane)); + unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + mb->token_costs[tx_size][plane_type][ref]; + unsigned int(*token_costs_cur)[2][COEFF_CONTEXTS][ENTROPY_TOKENS]; + int64_t eob_cost0, eob_cost1; + const int ctx0 = ctx; + int64_t accu_rate = 0; + // Initialized to the worst possible error for the largest transform size. + // This ensures that it never goes negative. + int64_t accu_error = ((int64_t)1) << 50; + int64_t best_block_rd_cost = INT64_MAX; + int x_prev = 1; + tran_low_t before_best_eob_qc = 0; + tran_low_t before_best_eob_dqc = 0; + + assert((!plane_type && !plane) || (plane_type && plane)); assert(eob <= default_eob); - /* Now set up a Viterbi trellis to evaluate alternative roundings. */ - /* Initialize the sentinel node of the trellis. 
*/ - tokens[eob][0].rate = 0; - tokens[eob][0].error = 0; - tokens[eob][0].next = default_eob; - tokens[eob][0].token = EOB_TOKEN; - tokens[eob][0].qc = 0; - tokens[eob][1] = tokens[eob][0]; - for (i = 0; i < eob; i++) { const int rc = scan[i]; token_cache[rc] = vp9_pt_energy_class[vp9_get_token(qcoeff[rc])]; } + final_eob = 0; - for (i = eob; i-- > 0;) { - int base_bits, d2, dx; + // Initial RD cost. + token_costs_cur = token_costs + band_translate[0]; + rate0 = (*token_costs_cur)[0][ctx0][EOB_TOKEN]; + best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error); + + // For each token, pick one of two choices greedily: + // (i) First candidate: Keep current quantized value, OR + // (ii) Second candidate: Reduce quantized value by 1. + for (i = 0; i < eob; i++) { const int rc = scan[i]; - int x = qcoeff[rc]; - /* Only add a trellis state for non-zero coefficients. */ - if (x) { - error0 = tokens[next][0].error; - error1 = tokens[next][1].error; - /* Evaluate the first possibility for this state. */ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - base_bits = vp9_get_token_cost(x, &t0, cat6_high_cost); - /* Consider both possible successor states. */ - if (next < default_eob) { - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += (*token_costs)[0][pt][tokens[next][0].token]; - rate1 += (*token_costs)[0][pt][tokens[next][1].token]; - } - UPDATE_RD_COST(); - /* And pick the best. */ - best = rd_cost1 < rd_cost0; - dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift); + const int x = qcoeff[rc]; + const int band_cur = band_translate[i]; + const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i); + const int token_tree_sel_cur = (x_prev == 0); + token_costs_cur = token_costs + band_cur; + if (x == 0) { // No need to search + const int token = vp9_get_token(x); + rate0 = (*token_costs_cur)[token_tree_sel_cur][ctx_cur][token]; + accu_rate += rate0; + x_prev = 0; + // Note: accu_error does not change. + } else { + const int dqv = dequant_ptr[rc != 0]; + // Compute the distortion for quantizing to 0. + const int diff_for_zero_raw = (0 - coeff[rc]) * (1 << shift); + const int diff_for_zero = #if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx >>= xd->bd - 8; - } + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff_for_zero_raw, xd->bd - 8) + : +#endif + diff_for_zero_raw; + const int64_t distortion_for_zero = + (int64_t)diff_for_zero * diff_for_zero; + + // Compute the distortion for the first candidate + const int diff0_raw = (dqcoeff[rc] - coeff[rc]) * (1 << shift); + const int diff0 = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff0_raw, xd->bd - 8) + : #endif // CONFIG_VP9_HIGHBITDEPTH - d2 = dx * dx; - tokens[i][0].rate = base_bits + (best ? rate1 : rate0); - tokens[i][0].error = d2 + (best ? error1 : error0); - tokens[i][0].next = next; - tokens[i][0].token = t0; - tokens[i][0].qc = x; - tokens[i][0].dqc = dqcoeff[rc]; - tokens[i][0].best_index = best; - - /* Evaluate the second possibility for this state. 
*/ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - - if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) && - (abs(x) * dequant_ptr[rc != 0] < - (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) { - sz = -(x < 0); - x -= 2 * sz + 1; + diff0_raw; + const int64_t distortion0 = (int64_t)diff0 * diff0; + + // Compute the distortion for the second candidate + const int sign = -(x < 0); // -1 if x is negative and 0 otherwise. + const int x1 = x - 2 * sign - 1; // abs(x1) = abs(x) - 1. + int64_t distortion1; + if (x1 != 0) { + const int dqv_step = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? dqv >> (xd->bd - 8) + : +#endif // CONFIG_VP9_HIGHBITDEPTH + dqv; + const int diff_step = (dqv_step + sign) ^ sign; + const int diff1 = diff0 - diff_step; + assert(dqv > 0); // We aren't right shifting a negative number above. + distortion1 = (int64_t)diff1 * diff1; } else { - tokens[i][1] = tokens[i][0]; - next = i; - - if (!(--band_left)) { - --band_counts; - band_left = *band_counts; - --token_costs; - } - continue; + distortion1 = distortion_for_zero; } - - /* Consider both possible successor states. */ - if (!x) { - /* If we reduced this coefficient to zero, check to see if - * we need to move the EOB back here. - */ - t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; - t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; - base_bits = 0; - } else { - base_bits = vp9_get_token_cost(x, &t0, cat6_high_cost); - t1 = t0; + { + // Calculate RDCost for current coeff for the two candidates. + const int64_t base_bits0 = vp9_get_token_cost(x, &t0, cat6_high_cost); + const int64_t base_bits1 = vp9_get_token_cost(x1, &t1, cat6_high_cost); + rate0 = + base_bits0 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t0]; + rate1 = + base_bits1 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t1]; } - if (next < default_eob) { - if (t0 != EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += (*token_costs)[!x][pt][tokens[next][0].token]; - } - if (t1 != EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache); - rate1 += (*token_costs)[!x][pt][tokens[next][1].token]; + { + int rdcost_better_for_x1, eob_rdcost_better_for_x1; + int dqc0, dqc1; + int64_t best_eob_cost_cur; + int use_x1; + + // Calculate RD Cost effect on the next coeff for the two candidates. + int64_t next_bits0 = 0; + int64_t next_bits1 = 0; + int64_t next_eob_bits0 = 0; + int64_t next_eob_bits1 = 0; + if (i < default_eob - 1) { + int ctx_next, token_tree_sel_next; + const int band_next = band_translate[i + 1]; + const int token_next = + (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN; + unsigned int( + *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + token_costs + band_next; + token_cache[rc] = vp9_pt_energy_class[t0]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x == 0); + next_bits0 = + (*token_costs_next)[token_tree_sel_next][ctx_next][token_next]; + next_eob_bits0 = + (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; + token_cache[rc] = vp9_pt_energy_class[t1]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x1 == 0); + next_bits1 = + (*token_costs_next)[token_tree_sel_next][ctx_next][token_next]; + if (x1 != 0) { + next_eob_bits1 = + (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; + } } - } - UPDATE_RD_COST(); - /* And pick the best. 
*/ - best = rd_cost1 < rd_cost0; + // Compare the total RD costs for two candidates. + rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), distortion0); + rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), distortion1); + rdcost_better_for_x1 = (rd_cost1 < rd_cost0); + eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0), + (accu_error + distortion0 - distortion_for_zero)); + eob_cost1 = eob_cost0; + if (x1 != 0) { + eob_cost1 = + RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1), + (accu_error + distortion1 - distortion_for_zero)); + eob_rdcost_better_for_x1 = (eob_cost1 < eob_cost0); + } else { + eob_rdcost_better_for_x1 = 0; + } -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz; - } else { - dx -= (dequant_ptr[rc != 0] + sz) ^ sz; - } -#else - dx -= (dequant_ptr[rc != 0] + sz) ^ sz; -#endif // CONFIG_VP9_HIGHBITDEPTH - d2 = dx * dx; - - tokens[i][1].rate = base_bits + (best ? rate1 : rate0); - tokens[i][1].error = d2 + (best ? error1 : error0); - tokens[i][1].next = next; - tokens[i][1].token = best ? t1 : t0; - tokens[i][1].qc = x; - - if (x) { - tran_low_t offset = dq_step[rc != 0]; - // The 32x32 transform coefficient uses half quantization step size. - // Account for the rounding difference in the dequantized coefficeint - // value when the quantization index is dropped from an even number - // to an odd number. - if (shift & x) offset += (dequant_ptr[rc != 0] & 0x01); - - if (sz == 0) - tokens[i][1].dqc = dqcoeff[rc] - offset; - else - tokens[i][1].dqc = dqcoeff[rc] + offset; - } else { - tokens[i][1].dqc = 0; - } + // Calculate the two candidate de-quantized values. + dqc0 = dqcoeff[rc]; + dqc1 = 0; + if (rdcost_better_for_x1 + eob_rdcost_better_for_x1) { + if (x1 != 0) { + dqc1 = RIGHT_SHIFT_POSSIBLY_NEGATIVE(x1 * dqv, shift); + } else { + dqc1 = 0; + } + } - tokens[i][1].best_index = best; - /* Finally, make this the new head of the trellis. */ - next = i; - } else { - /* There's no choice to make for a zero coefficient, so we don't - * add a new trellis node, but we do need to update the costs. - */ - pt = get_coef_context(nb, token_cache, i + 1); - t0 = tokens[next][0].token; - t1 = tokens[next][1].token; - /* Update the cost of each path if we're past the EOB token. */ - if (t0 != EOB_TOKEN) { - tokens[next][0].rate += (*token_costs)[1][pt][t0]; - tokens[next][0].token = ZERO_TOKEN; - } - if (t1 != EOB_TOKEN) { - tokens[next][1].rate += (*token_costs)[1][pt][t1]; - tokens[next][1].token = ZERO_TOKEN; + // Pick and record the better quantized and de-quantized values. + if (rdcost_better_for_x1) { + qcoeff[rc] = x1; + dqcoeff[rc] = dqc1; + accu_rate += rate1; + accu_error += distortion1 - distortion_for_zero; + assert(distortion1 <= distortion_for_zero); + token_cache[rc] = vp9_pt_energy_class[t1]; + } else { + accu_rate += rate0; + accu_error += distortion0 - distortion_for_zero; + assert(distortion0 <= distortion_for_zero); + token_cache[rc] = vp9_pt_energy_class[t0]; + } + assert(accu_error >= 0); + x_prev = qcoeff[rc]; // Update based on selected quantized value. + + use_x1 = (x1 != 0) && eob_rdcost_better_for_x1; + best_eob_cost_cur = use_x1 ? 
eob_cost1 : eob_cost0; + + // Determine whether to move the eob position to i+1 + if (best_eob_cost_cur < best_block_rd_cost) { + best_block_rd_cost = best_eob_cost_cur; + final_eob = i + 1; + if (use_x1) { + before_best_eob_qc = x1; + before_best_eob_dqc = dqc1; + } else { + before_best_eob_qc = x; + before_best_eob_dqc = dqc0; + } + } } - tokens[i][0].best_index = tokens[i][1].best_index = 0; - /* Don't update next, because we didn't add a new node. */ - } - - if (!(--band_left)) { - --band_counts; - band_left = *band_counts; - --token_costs; } } - - /* Now pick the best path through the whole trellis. */ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - error0 = tokens[next][0].error; - error1 = tokens[next][1].error; - t0 = tokens[next][0].token; - t1 = tokens[next][1].token; - rate0 += (*token_costs)[0][ctx][t0]; - rate1 += (*token_costs)[0][ctx][t1]; - UPDATE_RD_COST(); - best = rd_cost1 < rd_cost0; - final_eob = -1; - - for (i = next; i < eob; i = next) { - const int x = tokens[i][best].qc; - const int rc = scan[i]; - if (x) final_eob = i; - qcoeff[rc] = x; - dqcoeff[rc] = tokens[i][best].dqc; - next = tokens[i][best].next; - best = tokens[i][best].best_index; + assert(final_eob <= eob); + if (final_eob > 0) { + int rc; + assert(before_best_eob_qc != 0); + i = final_eob - 1; + rc = scan[i]; + qcoeff[rc] = before_best_eob_qc; + dqcoeff[rc] = before_best_eob_dqc; + } + for (i = final_eob; i < eob; i++) { + int rc = scan[i]; + qcoeff[rc] = 0; + dqcoeff[rc] = 0; } - final_eob++; - mb->plane[plane].eobs[block] = final_eob; return final_eob; } +#undef RIGHT_SHIFT_POSSIBLY_NEGATIVE static INLINE void fdct32x32(int rd_transform, const int16_t *src, tran_low_t *dst, int src_stride) { @@ -358,6 +333,8 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -381,7 +358,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, scan_order->scan, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); @@ -411,7 +388,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, eob, scan_order->scan, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); @@ -432,6 +409,9 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. 
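
A note on RIGHT_SHIFT_POSSIBLY_NEGATIVE, defined and #undef'd around the new greedy vp9_optimize_b above: right-shifting a negative value is implementation-defined in C, and the macro instead negates before shifting, which also truncates toward zero like integer division. A small check (the -3 alternative assumes a two's-complement arithmetic shift):

#include <stdio.h>

#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \
  (((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift)))

int main(void) {
  /* A plain -5 >> 1 commonly yields -3 (arithmetic shift, rounding down);
   * the macro yields -2, matching truncating division by 2. */
  printf("%d\n", RIGHT_SHIFT_POSSIBLY_NEGATIVE(-5, 1)); /* -2 */
  printf("%d\n", RIGHT_SHIFT_POSSIBLY_NEGATIVE(5, 1));  /* 2 */
  return 0;
}
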
+ assert(!x->skip_block); + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { switch (tx_size) { @@ -454,7 +434,7 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, eob); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); @@ -482,7 +462,7 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, qcoeff, dqcoeff, pd->dequant[0], eob); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; @@ -503,6 +483,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -529,7 +511,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, @@ -562,7 +544,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, scan_order->scan, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); @@ -655,8 +637,8 @@ static void encode_block(int plane, int block, int row, int col, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - x->highbd_itxm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], - xd->bd); + x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], + xd->bd); break; default: assert(0 && "Invalid transform size"); } @@ -678,7 +660,7 @@ static void encode_block(int plane, int block, int row, int col, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
- x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; default: assert(0 && "Invalid transform size"); break; } @@ -700,12 +682,12 @@ static void encode_block_pass1(int plane, int block, int row, int col, if (p->eobs[block] > 0) { #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - x->highbd_itxm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride, - p->eobs[block], xd->bd); + x->highbd_inv_txfm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride, + p->eobs[block], xd->bd); return; } #endif // CONFIG_VP9_HIGHBITDEPTH - x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); } } @@ -799,6 +781,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, (x->skip_encode || x->fp_src_pred) ? src_stride : dst_stride, dst, dst_stride, col, row, plane); + // skip block condition should be handled before this is called. + assert(!x->skip_block); + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); @@ -869,7 +854,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, if (tx_type != DCT_DCT) vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, @@ -883,7 +868,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, // this is like vp9_short_idct4x4 but has a special case around // eob<=1 which is significant (not just an optimization) for the // lossless case. - x->highbd_itxm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); + x->highbd_inv_txfm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); } else { vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type, xd->bd); @@ -951,7 +936,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, if (tx_type != DCT_DCT) vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); @@ -964,7 +949,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - x->itxm_add(dqcoeff, dst, dst_stride, *eob); + x->inv_txfm_add(dqcoeff, dst, dst_stride, *eob); else vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } diff --git a/libvpx/vp9/encoder/vp9_encoder.c b/libvpx/vp9/encoder/vp9_encoder.c index f57f40dbe..2ae59dd98 100644 --- a/libvpx/vp9/encoder/vp9_encoder.c +++ b/libvpx/vp9/encoder/vp9_encoder.c @@ -71,7 +71,6 @@ // mv. Choose a very high value for // now so that HIGH_PRECISION is always // chosen. 
-// #define OUTPUT_YUV_REC #define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold #define FRAME_RATE_FACTOR 8 @@ -80,7 +79,7 @@ FILE *yuv_denoised_file = NULL; #endif #ifdef OUTPUT_YUV_SKINMAP -FILE *yuv_skinmap_file = NULL; +static FILE *yuv_skinmap_file = NULL; #endif #ifdef OUTPUT_YUV_REC FILE *yuv_rec_file; @@ -438,34 +437,37 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) { /* clang-format off */ const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { - { LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 }, - { LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 }, - { LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 }, - { LEVEL_2_1, 9216000, 245760, 3600, 2800, 2, 2, 4, 8 }, - { LEVEL_3, 20736000, 552960, 7200, 6000, 2, 4, 4, 8 }, - { LEVEL_3_1, 36864000, 983040, 12000, 10000, 2, 4, 4, 8 }, - { LEVEL_4, 83558400, 2228224, 18000, 16000, 4, 4, 4, 8 }, - { LEVEL_4_1, 160432128, 2228224, 30000, 18000, 4, 4, 5, 6 }, - { LEVEL_5, 311951360, 8912896, 60000, 36000, 6, 8, 6, 4 }, - { LEVEL_5_1, 588251136, 8912896, 120000, 46000, 8, 8, 10, 4 }, + // sample rate size breadth bitrate cpb + { LEVEL_1, 829440, 36864, 512, 200, 400, 2, 1, 4, 8 }, + { LEVEL_1_1, 2764800, 73728, 768, 800, 1000, 2, 1, 4, 8 }, + { LEVEL_2, 4608000, 122880, 960, 1800, 1500, 2, 1, 4, 8 }, + { LEVEL_2_1, 9216000, 245760, 1344, 3600, 2800, 2, 2, 4, 8 }, + { LEVEL_3, 20736000, 552960, 2048, 7200, 6000, 2, 4, 4, 8 }, + { LEVEL_3_1, 36864000, 983040, 2752, 12000, 10000, 2, 4, 4, 8 }, + { LEVEL_4, 83558400, 2228224, 4160, 18000, 16000, 4, 4, 4, 8 }, + { LEVEL_4_1, 160432128, 2228224, 4160, 30000, 18000, 4, 4, 5, 6 }, + { LEVEL_5, 311951360, 8912896, 8384, 60000, 36000, 6, 8, 6, 4 }, + { LEVEL_5_1, 588251136, 8912896, 8384, 120000, 46000, 8, 8, 10, 4 }, // TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when // they are finalized (currently tentative). - { LEVEL_5_2, 1176502272, 8912896, 180000, 90000, 8, 8, 10, 4 }, - { LEVEL_6, 1176502272, 35651584, 180000, 90000, 8, 16, 10, 4 }, - { LEVEL_6_1, 2353004544u, 35651584, 240000, 180000, 8, 16, 10, 4 }, - { LEVEL_6_2, 4706009088u, 35651584, 480000, 360000, 8, 16, 10, 4 }, + { LEVEL_5_2, 1176502272, 8912896, 8384, 180000, 90000, 8, 8, 10, 4 }, + { LEVEL_6, 1176502272, 35651584, 16832, 180000, 90000, 8, 16, 10, 4 }, + { LEVEL_6_1, 2353004544u, 35651584, 16832, 240000, 180000, 8, 16, 10, 4 }, + { LEVEL_6_2, 4706009088u, 35651584, 16832, 480000, 360000, 8, 16, 10, 4 }, }; /* clang-format on */ -static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = - { "The average bit-rate is too high.", - "The picture size is too large.", - "The luma sample rate is too large.", - "The CPB size is too large.", - "The compression ratio is too small", - "Too many column tiles are used.", - "The alt-ref distance is too small.", - "Too many reference buffers are used." }; +static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { + "The average bit-rate is too high.", + "The picture size is too large.", + "The picture width/height is too large.", + "The luma sample rate is too large.", + "The CPB size is too large.", + "The compression ratio is too small", + "Too many column tiles are used.", + "The alt-ref distance is too small.", + "Too many reference buffers are used." 
+}; static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { switch (mode) { @@ -567,6 +569,8 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { (double)this_level->max_luma_sample_rate * (1 + SAMPLE_RATE_GRACE_P) || level_spec->max_luma_picture_size > this_level->max_luma_picture_size || + level_spec->max_luma_picture_breadth > + this_level->max_luma_picture_breadth || level_spec->average_bitrate > this_level->average_bitrate || level_spec->max_cpb_size > this_level->max_cpb_size || level_spec->compression_ratio < this_level->compression_ratio || @@ -739,7 +743,9 @@ void vp9_initialize_enc(void) { vp9_init_me_luts(); vp9_rc_init_minq_luts(); vp9_entropy_mv_init(); +#if !CONFIG_REALTIME_ONLY vp9_temporal_filter_init(); +#endif init_done = 1; } } @@ -779,9 +785,15 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { cpi->nmvsadcosts_hp[0] = NULL; cpi->nmvsadcosts_hp[1] = NULL; + vpx_free(cpi->skin_map); + cpi->skin_map = NULL; + vpx_free(cpi->prev_partition); cpi->prev_partition = NULL; + vpx_free(cpi->svc.prev_partition_svc); + cpi->svc.prev_partition_svc = NULL; + vpx_free(cpi->prev_segment_id); cpi->prev_segment_id = NULL; @@ -794,6 +806,11 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->content_state_sb_fd); cpi->content_state_sb_fd = NULL; + vpx_free(cpi->count_arf_frame_usage); + cpi->count_arf_frame_usage = NULL; + vpx_free(cpi->count_lastgolden_frame_usage); + cpi->count_lastgolden_frame_usage = NULL; + vp9_cyclic_refresh_free(cpi->cyclic_refresh); cpi->cyclic_refresh = NULL; @@ -911,6 +928,7 @@ static void restore_coding_context(VP9_COMP *cpi) { *cm->fc = cc->fc; } +#if !CONFIG_REALTIME_ONLY static void configure_static_seg_features(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; @@ -1034,6 +1052,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) { } } } +#endif // !CONFIG_REALTIME_ONLY static void update_reference_segmentation_map(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; @@ -1203,6 +1222,14 @@ static void set_tile_limits(VP9_COMP *cpi) { clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); cm->log2_tile_rows = cpi->oxcf.tile_rows; } + + if (cpi->oxcf.target_level == LEVEL_AUTO) { + const int level_tile_cols = + log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height); + if (cm->log2_tile_cols > level_tile_cols) { + cm->log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols); + } + } } static void update_frame_size(VP9_COMP *cpi) { @@ -1318,14 +1345,12 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, } #if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ +#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ cpi->fn_ptr[BT].sdx4df = SDX4DF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ @@ -1364,47 +1389,6 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, 4; \ } -#define MAKE_BFP_SAD3_WRAPPER(fnname) \ - static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - fnname(src_ptr, source_stride, ref_ptr, 
ref_stride, sad_array); \ - } \ - static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 3; i++) sad_array[i] >>= 2; \ - } \ - static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 3; i++) sad_array[i] >>= 4; \ - } - -#define MAKE_BFP_SAD8_WRAPPER(fnname) \ - static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - } \ - static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 8; i++) sad_array[i] >>= 2; \ - } \ - static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 8; i++) sad_array[i] >>= 4; \ - } #define MAKE_BFP_SAD4D_WRAPPER(fnname) \ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ const uint8_t *const ref_ptr[], int ref_stride, \ @@ -1440,46 +1424,30 @@ MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3) 
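/*
 * [Editor's sketch] The deleted x3/x8 wrappers normalized bit depth exactly
 * like the surviving MAKE_BFP_SAD_WRAPPER / MAKE_BFP_SAD4D_WRAPPER pair: a
 * SAD over 10-bit samples can be 4x the 8-bit range and a 12-bit SAD 16x,
 * so the _bits10/_bits12 variants shift results by 2 and 4. Hand-expanded,
 * with the real vpx_highbd_sad16x16 primitive:
 *
 *   static unsigned int sad16x16_bits10(const uint8_t *src_ptr,
 *                                       int source_stride,
 *                                       const uint8_t *ref_ptr,
 *                                       int ref_stride) {
 *     // scale the 10-bit SAD back to the 8-bit range
 *     return vpx_highbd_sad16x16(src_ptr, source_stride, ref_ptr,
 *                                ref_stride) >> 2;
 *   }
 */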
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d) static void highbd_set_var_fns(VP9_COMP *const cpi) { @@ -1490,253 +1458,236 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8, vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16, - vpx_highbd_8_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance32x16, vpx_highbd_sad32x16x4d_bits8) HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8, vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32, - vpx_highbd_8_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance16x32, vpx_highbd_sad16x32x4d_bits8) HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8, vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32, - vpx_highbd_8_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance64x32, vpx_highbd_sad64x32x4d_bits8) HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8, vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64, - vpx_highbd_8_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance32x64, vpx_highbd_sad32x64x4d_bits8) HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8, vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32, vpx_highbd_8_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits8, vpx_highbd_sad32x32x8_bits8, vpx_highbd_sad32x32x4d_bits8) HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8, vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64, vpx_highbd_8_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits8, vpx_highbd_sad64x64x8_bits8, vpx_highbd_sad64x64x4d_bits8) HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8, vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16, vpx_highbd_8_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits8, vpx_highbd_sad16x16x8_bits8, vpx_highbd_sad16x16x4d_bits8) - HIGHBD_BFP( - BLOCK_16X8, vpx_highbd_sad16x8_bits8, vpx_highbd_sad16x8_avg_bits8, - vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8, - vpx_highbd_8_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits8, - vpx_highbd_sad16x8x8_bits8, vpx_highbd_sad16x8x4d_bits8) + HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8, + vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8, + vpx_highbd_8_sub_pixel_variance16x8, + vpx_highbd_8_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits8) - HIGHBD_BFP( - BLOCK_8X16, vpx_highbd_sad8x16_bits8, vpx_highbd_sad8x16_avg_bits8, - vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16, - vpx_highbd_8_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits8, - vpx_highbd_sad8x16x8_bits8, vpx_highbd_sad8x16x4d_bits8) + HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8, + vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16, + vpx_highbd_8_sub_pixel_variance8x16, + vpx_highbd_8_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits8) HIGHBD_BFP( BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, - vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits8, - vpx_highbd_sad8x8x8_bits8, vpx_highbd_sad8x8x4d_bits8) + vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x4d_bits8) - HIGHBD_BFP(BLOCK_8X4, 
vpx_highbd_sad8x4_bits8, - vpx_highbd_sad8x4_avg_bits8, vpx_highbd_8_variance8x4, - vpx_highbd_8_sub_pixel_variance8x4, - vpx_highbd_8_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits8, vpx_highbd_sad8x4x4d_bits8) + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8, + vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, + vpx_highbd_8_sub_pixel_avg_variance8x4, vpx_highbd_sad8x4x4d_bits8) - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, - vpx_highbd_sad4x8_avg_bits8, vpx_highbd_8_variance4x8, - vpx_highbd_8_sub_pixel_variance4x8, - vpx_highbd_8_sub_pixel_avg_variance4x8, NULL, - vpx_highbd_sad4x8x8_bits8, vpx_highbd_sad4x8x4d_bits8) + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8, + vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, + vpx_highbd_8_sub_pixel_avg_variance4x8, vpx_highbd_sad4x8x4d_bits8) HIGHBD_BFP( BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, - vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits8, - vpx_highbd_sad4x4x8_bits8, vpx_highbd_sad4x4x4d_bits8) + vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits8) break; case VPX_BITS_10: HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10, vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16, - vpx_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance32x16, vpx_highbd_sad32x16x4d_bits10) HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10, vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32, - vpx_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance16x32, vpx_highbd_sad16x32x4d_bits10) HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10, vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32, - vpx_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance64x32, vpx_highbd_sad64x32x4d_bits10) HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10, vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64, - vpx_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance32x64, vpx_highbd_sad32x64x4d_bits10) HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10, vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32, vpx_highbd_10_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits10, vpx_highbd_sad32x32x8_bits10, vpx_highbd_sad32x32x4d_bits10) HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10, vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64, vpx_highbd_10_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits10, vpx_highbd_sad64x64x8_bits10, vpx_highbd_sad64x64x4d_bits10) HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10, vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16, vpx_highbd_10_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits10, vpx_highbd_sad16x16x8_bits10, vpx_highbd_sad16x16x4d_bits10) HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10, vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8, vpx_highbd_10_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x3_bits10, vpx_highbd_sad16x8x8_bits10, vpx_highbd_sad16x8x4d_bits10) 
HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10, vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16, vpx_highbd_10_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x3_bits10, vpx_highbd_sad8x16x8_bits10, vpx_highbd_sad8x16x4d_bits10) - HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad8x8_avg_bits10, - vpx_highbd_10_variance8x8, vpx_highbd_10_sub_pixel_variance8x8, - vpx_highbd_10_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits10, - vpx_highbd_sad8x8x8_bits10, vpx_highbd_sad8x8x4d_bits10) + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10, + vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, + vpx_highbd_10_sub_pixel_variance8x8, + vpx_highbd_10_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits10) HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, vpx_highbd_10_sub_pixel_variance8x4, - vpx_highbd_10_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits10, vpx_highbd_sad8x4x4d_bits10) + vpx_highbd_10_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits10) HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, vpx_highbd_10_sub_pixel_variance4x8, - vpx_highbd_10_sub_pixel_avg_variance4x8, NULL, - vpx_highbd_sad4x8x8_bits10, vpx_highbd_sad4x8x4d_bits10) - - HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad4x4_avg_bits10, - vpx_highbd_10_variance4x4, vpx_highbd_10_sub_pixel_variance4x4, - vpx_highbd_10_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits10, - vpx_highbd_sad4x4x8_bits10, vpx_highbd_sad4x4x4d_bits10) + vpx_highbd_10_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits10) + + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10, + vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, + vpx_highbd_10_sub_pixel_variance4x4, + vpx_highbd_10_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits10) break; case VPX_BITS_12: HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, - vpx_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance32x16, vpx_highbd_sad32x16x4d_bits12) HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12, vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32, - vpx_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance16x32, vpx_highbd_sad16x32x4d_bits12) HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12, vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32, - vpx_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance64x32, vpx_highbd_sad64x32x4d_bits12) HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12, vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64, - vpx_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance32x64, vpx_highbd_sad32x64x4d_bits12) HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12, vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32, vpx_highbd_12_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits12, vpx_highbd_sad32x32x8_bits12, vpx_highbd_sad32x32x4d_bits12) HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12, vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64, 
vpx_highbd_12_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits12, vpx_highbd_sad64x64x8_bits12, vpx_highbd_sad64x64x4d_bits12) HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12, vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16, vpx_highbd_12_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits12, vpx_highbd_sad16x16x8_bits12, vpx_highbd_sad16x16x4d_bits12) HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12, vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8, vpx_highbd_12_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x3_bits12, vpx_highbd_sad16x8x8_bits12, vpx_highbd_sad16x8x4d_bits12) HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12, vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16, vpx_highbd_12_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x3_bits12, vpx_highbd_sad8x16x8_bits12, vpx_highbd_sad8x16x4d_bits12) - HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad8x8_avg_bits12, - vpx_highbd_12_variance8x8, vpx_highbd_12_sub_pixel_variance8x8, - vpx_highbd_12_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits12, - vpx_highbd_sad8x8x8_bits12, vpx_highbd_sad8x8x4d_bits12) + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12, + vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, + vpx_highbd_12_sub_pixel_variance8x8, + vpx_highbd_12_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits12) HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, vpx_highbd_12_sub_pixel_variance8x4, - vpx_highbd_12_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits12, vpx_highbd_sad8x4x4d_bits12) + vpx_highbd_12_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits12) HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, vpx_highbd_12_sub_pixel_variance4x8, - vpx_highbd_12_sub_pixel_avg_variance4x8, NULL, - vpx_highbd_sad4x8x8_bits12, vpx_highbd_sad4x8x4d_bits12) - - HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad4x4_avg_bits12, - vpx_highbd_12_variance4x4, vpx_highbd_12_sub_pixel_variance4x4, - vpx_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits12, - vpx_highbd_sad4x4x8_bits12, vpx_highbd_sad4x4x4d_bits12) + vpx_highbd_12_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits12) + + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12, + vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, + vpx_highbd_12_sub_pixel_variance4x4, + vpx_highbd_12_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits12) break; default: @@ -1902,6 +1853,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_reset_resize(cpi); + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; } if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || @@ -1912,6 +1865,24 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { (int)cpi->oxcf.target_bandwidth); } + // Check for resetting the rc flags (rc_1_frame, rc_2_frame) if the + // configuration change has a large change in avg_frame_bandwidth. + // For SVC check for resetting based on spatial layer average bandwidth. + // Also reset buffer level to optimal level. 
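/*
 * [Editor's note] The rule that follows: a reconfiguration moving the
 * average frame bandwidth by more than half in either direction clears the
 * rc_1_frame/rc_2_frame history and snaps bits_off_target and buffer_level
 * back to optimal_buffer_level. With last_avg_frame_bandwidth = 10000 bits,
 * for example, a new average above 15000 (3/2x) or below 5000 (1/2x)
 * triggers the reset; SVC streams are handled per spatial layer via
 * vp9_svc_check_reset_layer_rc_flag() instead.
 */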
+ if (cm->current_video_frame > 0) { + if (cpi->use_svc) { + vp9_svc_check_reset_layer_rc_flag(cpi); + } else { + if (rc->avg_frame_bandwidth > (3 * rc->last_avg_frame_bandwidth >> 1) || + rc->avg_frame_bandwidth < (rc->last_avg_frame_bandwidth >> 1)) { + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; + rc->bits_off_target = rc->optimal_buffer_level; + rc->buffer_level = rc->optimal_buffer_level; + } + } + } + cpi->alt_ref_source = NULL; rc->is_src_frame_alt_ref = 0; @@ -2046,6 +2017,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, realloc_segmentation_maps(cpi); + CHECK_MEM_ERROR(cm, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); CHECK_MEM_ERROR( @@ -2162,7 +2136,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, #endif #endif #ifdef OUTPUT_YUV_SKINMAP - yuv_skinmap_file = fopen("skinmap.yuv", "ab"); + yuv_skinmap_file = fopen("skinmap.yuv", "wb"); #endif #ifdef OUTPUT_YUV_REC yuv_rec_file = fopen("rec.yuv", "wb"); @@ -2175,6 +2149,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; +#if !CONFIG_REALTIME_ONLY if (oxcf->pass == 1) { vp9_init_first_pass(cpi); } else if (oxcf->pass == 2) { @@ -2239,6 +2214,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, vp9_init_second_pass(cpi); } } +#endif // !CONFIG_REALTIME_ONLY vp9_set_speed_features_framesize_independent(cpi); vp9_set_speed_features_framesize_dependent(cpi); @@ -2248,67 +2224,61 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ cpi->fn_ptr[BT].sdx4df = SDX4DF; BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16, - vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d) BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32, - vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d) BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32, - vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d) BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64, - vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d) BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32, vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32, - vpx_sad32x32x3, vpx_sad32x32x8, vpx_sad32x32x4d) + vpx_sad32x32x4d) BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64, vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64, - vpx_sad64x64x3, vpx_sad64x64x8, vpx_sad64x64x4d) + vpx_sad64x64x4d) BFP(BLOCK_16X16, vpx_sad16x16, 
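/*
 * [Editor's sketch] With the sdx3f/sdx8f slots gone, motion search reaches
 * the table through the remaining hooks. A typical use, assuming local
 * src/ref buffers and a 4-entry ref4[] candidate array (these variable
 * names are illustrative only):
 *
 *   const vp9_variance_fn_ptr_t *fn = &cpi->fn_ptr[BLOCK_16X16];
 *   unsigned int sad = fn->sdf(src, src_stride, ref, ref_stride);
 *   unsigned int sads[4];
 *   fn->sdx4df(src, src_stride, ref4, ref_stride, sads);  // 4 candidates
 */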
vpx_sad16x16_avg, vpx_variance16x16, vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16, - vpx_sad16x16x3, vpx_sad16x16x8, vpx_sad16x16x4d) + vpx_sad16x16x4d) BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8, - vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x3, - vpx_sad16x8x8, vpx_sad16x8x4d) + vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, + vpx_sad16x8x4d) BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16, - vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x3, - vpx_sad8x16x8, vpx_sad8x16x4d) + vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, + vpx_sad8x16x4d) BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8, - vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x3, - vpx_sad8x8x8, vpx_sad8x8x4d) + vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d) BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4, - vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, NULL, - vpx_sad8x4x8, vpx_sad8x4x4d) + vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d) BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8, - vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, NULL, - vpx_sad4x8x8, vpx_sad4x8x4d) + vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d) BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4, - vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x3, - vpx_sad4x4x8, vpx_sad4x4x4d) + vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d) #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); @@ -2375,16 +2345,20 @@ void vp9_remove_compressor(VP9_COMP *cpi) { snprintf(headings, sizeof(headings), "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" "VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" - "WstPsnr\tWstSsim\tWstFast\tWstHVS"); + "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" + "AVPsnrY\tAPsnrCb\tAPsnrCr"); snprintf(results, sizeof(results), "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%7.3f\t%7.3f\t%7.3f", + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f", dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr, cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr, total_ssim, totalp_ssim, cpi->fastssim.stat[ALL] / cpi->count, cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst, - cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst); + cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst, + cpi->psnr.stat[Y] / cpi->count, cpi->psnr.stat[U] / cpi->count, + cpi->psnr.stat[V] / cpi->count); if (cpi->b_calculate_blockiness) { SNPRINT(headings, "\t Block\tWstBlck"); @@ -2557,7 +2531,7 @@ int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); if (cfg) { - vp8_yv12_copy_frame(cfg, sd); + vpx_yv12_copy_frame(cfg, sd); return 0; } else { return -1; @@ -2568,7 +2542,7 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); if (cfg) { - vp8_yv12_copy_frame(sd, cfg); + vpx_yv12_copy_frame(sd, cfg); return 0; } else { return -1; @@ -2581,38 +2555,6 @@ int vp9_update_entropy(VP9_COMP *cpi, int update) { return 0; } -#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP) -// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it -// as YUV 420. 
We simply use the top-left pixels of the UV buffers, since we do -// not denoise the UV channels at this time. If ever we implement UV channel -// denoising we will have to modify this. -void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) { - uint8_t *src = s->y_buffer; - int h = s->y_height; - - do { - fwrite(src, s->y_width, 1, f); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, f); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, f); - src += s->uv_stride; - } while (--h); -} -#endif - #ifdef OUTPUT_YUV_REC void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { YV12_BUFFER_CONFIG *s = cm->frame_to_show; @@ -2748,15 +2690,14 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, if (src->flags & YV12_FLAG_HIGHBITDEPTH) { vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride, - CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, - 16 / factor, 16 / factor, bd); + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor, + bd); } else { - vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, - 16 / factor); + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor); } } } @@ -2782,11 +2723,33 @@ static int scale_down(VP9_COMP *cpi, int q) { return scale; } -static int big_rate_miss(VP9_COMP *cpi, int high_limit, int low_limit) { +static int big_rate_miss_high_threshold(VP9_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; + int big_miss_high; - return (rc->projected_frame_size > ((high_limit * 3) / 2)) || - (rc->projected_frame_size < (low_limit / 2)); + if (frame_is_kf_gf_arf(cpi)) + big_miss_high = rc->this_frame_target * 3 / 2; + else + big_miss_high = rc->this_frame_target * 2; + + return big_miss_high; +} + +static int big_rate_miss(VP9_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + int big_miss_high; + int big_miss_low; + + // Ignore for overlay frames + if (rc->is_src_frame_alt_ref) { + return 0; + } else { + big_miss_low = (rc->this_frame_target / 2); + big_miss_high = big_rate_miss_high_threshold(cpi); + + return (rc->projected_frame_size > big_miss_high) || + (rc->projected_frame_size < big_miss_low); + } } // test in two pass for the first @@ -2811,8 +2774,7 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, int force_recode = 0; if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || - big_rate_miss(cpi, high_limit, low_limit) || - (cpi->sf.recode_loop == ALLOW_RECODE) || + big_rate_miss(cpi) || (cpi->sf.recode_loop == ALLOW_RECODE) || (two_pass_first_group_inter(cpi) && (cpi->sf.recode_loop == ALLOW_RECODE_FIRST)) || (frame_is_kfgfarf && (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF))) { @@ -2822,8 +2784,13 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, cpi->resize_pending = 1; return 1; } - // Force recode if projected_frame_size > max_frame_bandwidth - if (rc->projected_frame_size >= rc->max_frame_bandwidth) return 1; + + // Force recode for extreme overshoot. 
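/*
 * [Editor's note] Worked numbers for big_rate_miss() above: with
 * this_frame_target = 20000 bits, a key/golden/alt-ref frame is a "big
 * miss" outside [10000, 30000] (low = target/2, high = target*3/2), while
 * other frames get the wider [10000, 40000] window (high = target*2).
 * Overlay frames (is_src_frame_alt_ref) are exempt, and the hunk below
 * reuses the same high threshold to force a recode on extreme overshoot.
 */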
+ if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF && + rc->projected_frame_size >= big_rate_miss_high_threshold(cpi))) { + return 1; + } // TODO(agrange) high_limit could be greater than the scale-down threshold. if ((rc->projected_frame_size > high_limit && q < maxq) || @@ -2914,17 +2881,38 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && cpi->denoiser.denoising_level > kDenLowLow) { int svc_base_is_key = 0; + int denoise_svc_second_layer = 0; if (cpi->use_svc) { + int realloc_fail = 0; + const int svc_buf_shift = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 + ? cpi->denoiser.num_ref_frames + : 0; int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers); LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; svc_base_is_key = lc->is_key_frame; + denoise_svc_second_layer = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 ? 1 + : 0; + // Check if we need to allocate extra buffers in the denoiser + // for + // refreshed frames. + realloc_fail = vp9_denoiser_realloc_svc( + cm, &cpi->denoiser, svc_buf_shift, cpi->refresh_alt_ref_frame, + cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx, + cpi->gld_fb_idx, cpi->lst_fb_idx); + if (realloc_fail) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to re-allocate denoiser for SVC"); } vp9_denoiser_update_frame_info( &cpi->denoiser, *cpi->Source, cpi->common.frame_type, cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame, - cpi->refresh_last_frame, cpi->resize_pending, svc_base_is_key); + cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, + cpi->lst_fb_idx, cpi->resize_pending, svc_base_is_key, + denoise_svc_second_layer); } #endif if (is_one_pass_cbr_svc(cpi)) { @@ -3195,15 +3183,37 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { dc_quant_devisor = 4.0; #endif - fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d" - "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" " - "%10"PRId64" %10"PRId64" %10d " - "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" - "%6d %6d %5d %5d %5d " - "%10"PRId64" %10.3lf" - "%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n", + if (!cm->current_video_frame) { + fprintf(f, "frame, width, height, last ts, last end ts, " + "source_alt_ref_pending, source_alt_ref_active, " + "this_frame_target, projected_frame_size, " + "projected_frame_size / MBs, " + "projected_frame_size - this_frame_target, " + "vbr_bits_off_target, vbr_bits_off_target_fast, " + "twopass.extend_minq, twopass.extend_minq_fast, " + "total_target_vs_actual, " + "starting_buffer_level - bits_off_target, " + "total_actual_bits, base_qindex, q for base_qindex, " + "dc quant, q for active_worst_quality, avg_q, q for oxcf.cq_level, " + "refresh_last_frame, refresh_golden_frame, refresh_alt_ref_frame, " + "frame_type, gfu_boost, " + "twopass.bits_left, " + "twopass.total_left_stats.coded_error, " + "twopass.bits_left / (1 + twopass.total_left_stats.coded_error), " + "tot_recode_hits, recon_err, kf_boost, " + "twopass.kf_zeromotion_pct, twopass.fr_content_type, " + "filter_level, seg.aq_av_offset\n"); + } + + fprintf(f, "%10u, %d, %d, %10"PRId64", %10"PRId64", %d, %d, %10d, %10d, " + "%10d, %10d, %10"PRId64", %10"PRId64", %5d, %5d, %10"PRId64", " + "%10"PRId64", %10"PRId64", %10d, %7.2lf, %7.2lf, %7.2lf, %7.2lf, " + "%7.2lf, %6d, %6d, %5d, %5d, %5d, %10"PRId64", %10.3lf, %10lf, %8u, " + "%10"PRId64", %10d, %10d, 
%10d, %10d, %10d\n", cpi->common.current_video_frame, cm->width, cm->height, + cpi->last_time_stamp_seen, + cpi->last_end_time_stamp_seen, cpi->rc.source_alt_ref_pending, cpi->rc.source_alt_ref_active, cpi->rc.this_frame_target, @@ -3291,7 +3301,6 @@ static void set_size_independent_vars(VP9_COMP *cpi) { static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, int *top_index) { VP9_COMMON *const cm = &cpi->common; - const VP9EncoderConfig *const oxcf = &cpi->oxcf; // Setup variables that depend on the dimensions of the frame. vp9_set_speed_features_framesize_dependent(cpi); @@ -3303,17 +3312,19 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); } +#if !CONFIG_REALTIME_ONLY // Configure experimental use of segmentation for enhanced coding of // static regions if indicated. // Only allowed in the second pass of a two pass encode, as it requires // lagged coding, and if the relevant speed feature flag is set. - if (oxcf->pass == 2 && cpi->sf.static_segmentation) + if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation) configure_static_seg_features(cpi); +#endif // !CONFIG_REALTIME_ONLY #if CONFIG_VP9_POSTPROC && !(CONFIG_VP9_TEMPORAL_DENOISING) - if (oxcf->noise_sensitivity > 0) { + if (cpi->oxcf.noise_sensitivity > 0) { int l = 0; - switch (oxcf->noise_sensitivity) { + switch (cpi->oxcf.noise_sensitivity) { case 1: l = 20; break; case 2: l = 40; break; case 3: l = 60; break; @@ -3336,7 +3347,8 @@ static void setup_denoiser_buffer(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; if (cpi->oxcf.noise_sensitivity > 0 && !cpi->denoiser.frame_buffer_initialized) { - if (vp9_denoiser_alloc(&cpi->denoiser, cm->width, cm->height, + if (vp9_denoiser_alloc(cm, &cpi->svc, &cpi->denoiser, cpi->use_svc, + cpi->oxcf.noise_sensitivity, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, @@ -3364,6 +3376,7 @@ static void set_frame_size(VP9_COMP *cpi) { VP9EncoderConfig *const oxcf = &cpi->oxcf; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; +#if !CONFIG_REALTIME_ONLY if (oxcf->pass == 2 && oxcf->rc_mode == VPX_VBR && ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) || (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) { @@ -3374,6 +3387,7 @@ static void set_frame_size(VP9_COMP *cpi) { vp9_set_size_literal(cpi, oxcf->scaled_frame_width, oxcf->scaled_frame_height); } +#endif // !CONFIG_REALTIME_ONLY if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && !cpi->use_svc && oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending != 0) { @@ -3466,8 +3480,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // Flag to check if its valid to compute the source sad (used for // scene detection and for superblock content state in CBR mode). // The flag may get reset below based on SVC or resizing state. 
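/*
 * [Editor's sketch] Two related changes land in the next hunks:
 * compute_source_sad_onepass is now set for every REALTIME frame (the
 * speed and show_frame conditions move into the consumer), and the
 * scene-detection gate becomes, in effect:
 *
 *   if (cpi->compute_source_sad_onepass && cm->show_frame &&
 *       (cpi->oxcf.rc_mode == VPX_VBR ||
 *        cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
 *        (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8 && !cpi->use_svc)))
 *     vp9_scene_detection_onepass(cpi);
 *
 * i.e. unconditional for VBR and screen content, newly enabled for other
 * rate modes at speeds 5-7, with speed 8 deferred pending an encode-time
 * cost check per the in-tree comment.
 */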
- cpi->compute_source_sad_onepass = - cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cm->show_frame; + cpi->compute_source_sad_onepass = cpi->oxcf.mode == REALTIME; vpx_clear_system_state(); @@ -3518,6 +3531,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if ((cpi->use_svc && (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1 || + cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 || cpi->svc.current_superframe < 1)) || cpi->resize_pending || cpi->resize_state || cpi->external_resize || cpi->resize_state != ORIG) { @@ -3555,12 +3569,14 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, vp9_update_noise_estimate(cpi); - // Scene detection is used for VBR mode or screen-content case. - // Make sure compute_source_sad_onepass is set (which handles SVC case - // and dynamic resize). - if (cpi->compute_source_sad_onepass && + // Scene detection is always used for VBR mode or screen-content case. + // For other cases (e.g., CBR mode) use it for 5 <= speed < 8 for now + // (need to check encoding time cost for doing this for speed 8). + cpi->rc.high_source_sad = 0; + if (cpi->compute_source_sad_onepass && cm->show_frame && (cpi->oxcf.rc_mode == VPX_VBR || - cpi->oxcf.content == VP9E_CONTENT_SCREEN)) + cpi->oxcf.content == VP9E_CONTENT_SCREEN || + (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8 && !cpi->use_svc))) vp9_scene_detection_onepass(cpi); // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame @@ -3576,6 +3592,16 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (cpi->sf.copy_partition_flag) alloc_copy_partition_data(cpi); + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) { + if (cpi->svc.prev_partition_svc == NULL) { + CHECK_MEM_ERROR( + cm, cpi->svc.prev_partition_svc, + (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, + sizeof(*cpi->svc.prev_partition_svc))); + } + } + if (cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.content != VP9E_CONTENT_SCREEN && @@ -3660,6 +3686,7 @@ static int get_qstep_adj(int rate_excess, int rate_limit) { static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int bottom_index, top_index; @@ -3696,9 +3723,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, qrange_adj = VPXMAX(1, (top_index - bottom_index) / 2); bottom_index = - VPXMAX(bottom_index - qrange_adj / 2, cpi->oxcf.best_allowed_q); - top_index = - VPXMIN(cpi->oxcf.worst_allowed_q, top_index + qrange_adj / 2); + VPXMAX(bottom_index - qrange_adj / 2, oxcf->best_allowed_q); + top_index = VPXMIN(oxcf->worst_allowed_q, top_index + qrange_adj / 2); } #endif // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. @@ -3726,7 +3752,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source, - (cpi->oxcf.pass == 0), EIGHTTAP, 0); + (oxcf->pass == 0), EIGHTTAP, 0); // Unfiltered raw source used in metrics calculation if the source // has been filtered. 
@@ -3735,7 +3761,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (is_spatial_denoise_enabled(cpi)) { cpi->raw_source_frame = vp9_scale_if_required( cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, - (cpi->oxcf.pass == 0), EIGHTTAP, 0); + (oxcf->pass == 0), EIGHTTAP, 0); } else { cpi->raw_source_frame = cpi->Source; } @@ -3745,9 +3771,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, } if (cpi->unscaled_last_source != NULL) - cpi->Last_Source = vp9_scale_if_required( - cm, cpi->unscaled_last_source, &cpi->scaled_last_source, - (cpi->oxcf.pass == 0), EIGHTTAP, 0); + cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, + &cpi->scaled_last_source, + (oxcf->pass == 0), EIGHTTAP, 0); if (frame_is_intra_only(cm) == 0) { if (loop_count > 0) { @@ -3762,13 +3788,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + if (oxcf->aq_mode == VARIANCE_AQ) { vp9_vaq_frame_setup(cpi); - } else if (cpi->oxcf.aq_mode == EQUATOR360_AQ) { + } else if (oxcf->aq_mode == EQUATOR360_AQ) { vp9_360aq_frame_setup(cpi); - } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + } else if (oxcf->aq_mode == COMPLEXITY_AQ) { vp9_setup_in_frame_q_adj(cpi); - } else if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) { + } else if (oxcf->aq_mode == LOOKAHEAD_AQ) { vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); } @@ -3792,7 +3818,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; } - if (cpi->oxcf.rc_mode == VPX_Q) { + if (oxcf->rc_mode == VPX_Q) { loop = 0; } else { if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced && @@ -3872,11 +3898,16 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // Frame is too large if (rc->projected_frame_size > rc->this_frame_target) { // Special case if the projected size is > the max allowed. - if (rc->projected_frame_size >= rc->max_frame_bandwidth) { + if ((q == q_high) && + ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (rc->projected_frame_size >= + big_rate_miss_high_threshold(cpi)))) { + int max_rate = VPXMAX(1, VPXMIN(rc->max_frame_bandwidth, + big_rate_miss_high_threshold(cpi))); double q_val_high; q_val_high = vp9_convert_qindex_to_q(q_high, cm->bit_depth); - q_val_high = q_val_high * ((double)rc->projected_frame_size / - rc->max_frame_bandwidth); + q_val_high = + q_val_high * ((double)rc->projected_frame_size / max_rate); q_high = vp9_convert_q_to_qindex(q_val_high, cm->bit_depth); q_high = clamp(q_high, rc->best_quality, rc->worst_quality); } @@ -3885,7 +3916,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, qstep = get_qstep_adj(rc->projected_frame_size, rc->this_frame_target); q_low = VPXMIN(q + qstep, q_high); - // q_low = q < q_high ? q + 1 : q_high; if (undershoot_seen || loop_at_this_size > 1) { // Update rate_correction_factor unless @@ -3913,31 +3943,29 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, qstep = get_qstep_adj(rc->this_frame_target, rc->projected_frame_size); q_high = VPXMAX(q - qstep, q_low); - // q_high = q > q_low ? 
q - 1 : q_low; if (overshoot_seen || loop_at_this_size > 1) { vp9_rc_update_rate_correction_factors(cpi); q = (q_high + q_low) / 2; } else { vp9_rc_update_rate_correction_factors(cpi); - q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, - top_index); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + VPXMIN(q_low, bottom_index), top_index); // Special case reset for qlow for constrained quality. // This should only trigger where there is very substantial // undershoot on a frame and the auto cq level is above // the user passsed in value. - if (cpi->oxcf.rc_mode == VPX_CQ && q < q_low) { + if (oxcf->rc_mode == VPX_CQ && q < q_low) { q_low = q; } while (q > q_high && retries < 10) { vp9_rc_update_rate_correction_factors(cpi); - q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, - top_index); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + VPXMIN(q_low, bottom_index), top_index); retries++; } } - undershoot_seen = 1; } @@ -3971,9 +3999,21 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, #ifdef AGGRESSIVE_VBR if (two_pass_first_group_inter(cpi)) { cpi->twopass.active_worst_quality = - VPXMIN(q + qrange_adj, cpi->oxcf.worst_allowed_q); - } + VPXMIN(q + qrange_adj, oxcf->worst_allowed_q); + } else if (!frame_is_kf_gf_arf(cpi)) { +#else + if (!frame_is_kf_gf_arf(cpi)) { #endif + // Have we been forced to adapt Q outside the expected range by an extreme + // rate miss. If so adjust the active maxQ for the subsequent frames. + if (q > cpi->twopass.active_worst_quality) { + cpi->twopass.active_worst_quality = q; + } else if (oxcf->vbr_corpus_complexity && q == q_low && + rc->projected_frame_size < rc->this_frame_target) { + cpi->twopass.active_worst_quality = + VPXMAX(q, cpi->twopass.active_worst_quality - 1); + } + } if (enable_acl) { // Skip recoding, if model diff is below threshold @@ -4448,14 +4488,14 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, #if CONFIG_VP9_TEMPORAL_DENOISING #ifdef OUTPUT_YUV_DENOISED if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) { - vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME], - yuv_denoised_file); + vpx_write_yuv_frame(yuv_denoised_file, + &cpi->denoiser.running_avg_y[INTRA_FRAME]); } #endif #endif #ifdef OUTPUT_YUV_SKINMAP if (cpi->common.current_video_frame > 1) { - vp9_compute_skin_map(cpi, yuv_skinmap_file); + vp9_output_skin_map(cpi, yuv_skinmap_file); } #endif @@ -4592,6 +4632,7 @@ static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, encode_frame_to_data_rate(cpi, size, dest, frame_flags); } +#if !CONFIG_REALTIME_ONLY static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; @@ -4600,6 +4641,7 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)) vp9_twopass_postencode_update(cpi); } +#endif // !CONFIG_REALTIME_ONLY static void init_ref_frame_bufs(VP9_COMMON *cm) { int i; @@ -4822,6 +4864,7 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { int i, idx; uint64_t luma_samples, dur_end; const uint32_t luma_pic_size = cm->width * cm->height; + const uint32_t luma_pic_breadth = VPXMAX(cm->width, cm->height); LevelConstraint *const level_constraint = &cpi->level_constraint; const int8_t level_index = level_constraint->level_index; double cpb_data_size; @@ -4925,6 +4968,11 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, 
int arf_src_index) { level_spec->max_luma_picture_size = luma_pic_size; } + // update max_luma_picture_breadth + if (luma_pic_breadth > level_spec->max_luma_picture_breadth) { + level_spec->max_luma_picture_breadth = luma_pic_breadth; + } + // update compression_ratio level_spec->compression_ratio = (double)level_stats->total_uncompressed_size * cm->bit_depth / @@ -4945,6 +4993,15 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]); } + if (level_spec->max_luma_picture_breadth > + vp9_level_defs[level_index].max_luma_picture_breadth) { + level_constraint->fail_flag |= (1 << LUMA_PIC_BREADTH_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_PIC_BREADTH_TOO_LARGE]); + } + if ((double)level_spec->max_luma_sample_rate > (double)vp9_level_defs[level_index].max_luma_sample_rate * (1 + SAMPLE_RATE_GRACE_P)) { @@ -5094,7 +5151,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1; #endif - +#if !CONFIG_REALTIME_ONLY if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0)) { int bitrate = cpi->rc.avg_frame_bandwidth / 40; @@ -5114,7 +5171,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, force_src_buffer = &cpi->alt_ref_buffer; } - +#endif cm->show_frame = 0; cm->intra_only = 0; cpi->refresh_alt_ref_frame = 1; @@ -5145,8 +5202,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->intra_only = 0; // if the flags indicate intra frame, but if the current picture is for // non-zero spatial layer, it should not be an intra picture. - // TODO(Won Kap): this needs to change if per-layer intra frame is - // allowed. if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) { source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF); @@ -5175,10 +5230,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } else { *size = 0; +#if !CONFIG_REALTIME_ONLY if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { vp9_end_first_pass(cpi); /* get last stats packet */ cpi->twopass.first_pass_done = 1; } +#endif // !CONFIG_REALTIME_ONLY return -1; } @@ -5225,6 +5282,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->frame_flags = *frame_flags; +#if !CONFIG_REALTIME_ONLY if ((oxcf->pass == 2) && (!cpi->use_svc || (is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state != ENCODING))) { @@ -5232,6 +5290,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } else if (oxcf->pass == 1) { set_frame_size(cpi); } +#endif // !CONFIG_REALTIME_ONLY if (oxcf->pass != 1 && cpi->level_constraint.level_index >= 0 && cpi->level_constraint.fail_flag == 0) @@ -5242,20 +5301,28 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } cpi->td.mb.fp_src_pred = 0; +#if CONFIG_REALTIME_ONLY + if (cpi->use_svc) { + SvcEncode(cpi, size, dest, frame_flags); + } else { + // One pass encode + Pass0Encode(cpi, size, dest, frame_flags); + } +#else // !CONFIG_REALTIME_ONLY if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) { const int lossless = is_lossless_requested(oxcf); #if CONFIG_VP9_HIGHBITDEPTH if (cpi->oxcf.use_highbitdepth) - cpi->td.mb.fwd_txm4x4 = + cpi->td.mb.fwd_txfm4x4 = lossless ? 
vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; else - cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; - cpi->td.mb.highbd_itxm_add = + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; + cpi->td.mb.highbd_inv_txfm_add = lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add; #else - cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH - cpi->td.mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi, source); } else if (oxcf->pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) { Pass2Encode(cpi, size, dest, frame_flags); @@ -5265,6 +5332,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // One pass encode Pass0Encode(cpi, size, dest, frame_flags); } +#endif // CONFIG_REALTIME_ONLY if (cm->refresh_frame_context) cm->frame_contexts[cm->frame_context_idx] = *cm->fc; @@ -5631,7 +5699,7 @@ void vp9_set_row_mt(VP9_COMP *cpi) { cpi->row_mt = 1; } - if (cpi->row_mt && cpi->oxcf.max_threads > 1) + if (cpi->row_mt) cpi->row_mt_bit_exact = 1; else cpi->row_mt_bit_exact = 0; diff --git a/libvpx/vp9/encoder/vp9_encoder.h b/libvpx/vp9/encoder/vp9_encoder.h index 672c83bfd..d723d93cb 100644 --- a/libvpx/vp9/encoder/vp9_encoder.h +++ b/libvpx/vp9/encoder/vp9_encoder.h @@ -138,6 +138,7 @@ typedef enum { kHighSadLowSumdiff = 3, kHighSadHighSumdiff = 4, kLowVarHighSumdiff = 5, + kVeryHighSad = 6, } CONTENT_STATE_SB; typedef struct VP9EncoderConfig { @@ -208,6 +209,7 @@ typedef struct VP9EncoderConfig { int two_pass_vbrbias; // two pass datarate control tweaks int two_pass_vbrmin_section; int two_pass_vbrmax_section; + int vbr_corpus_complexity; // 0 indicates corpus vbr disabled // END DATARATE CONTROL OPTIONS // ---------------------------------------------------------------- @@ -359,6 +361,7 @@ typedef struct IMAGE_STAT { typedef enum { LEVEL_UNKNOWN = 0, + LEVEL_AUTO = 1, LEVEL_1 = 10, LEVEL_1_1 = 11, LEVEL_2 = 20, @@ -380,6 +383,7 @@ typedef struct { VP9_LEVEL level; uint64_t max_luma_sample_rate; uint32_t max_luma_picture_size; + uint32_t max_luma_picture_breadth; double average_bitrate; // in kilobits per second double max_cpb_size; // in kilobits double compression_ratio; @@ -419,14 +423,15 @@ typedef struct { typedef enum { BITRATE_TOO_LARGE = 0, - LUMA_PIC_SIZE_TOO_LARGE = 1, - LUMA_SAMPLE_RATE_TOO_LARGE = 2, - CPB_TOO_LARGE = 3, - COMPRESSION_RATIO_TOO_SMALL = 4, - TOO_MANY_COLUMN_TILE = 5, - ALTREF_DIST_TOO_SMALL = 6, - TOO_MANY_REF_BUFFER = 7, - TARGET_LEVEL_FAIL_IDS = 8 + LUMA_PIC_SIZE_TOO_LARGE, + LUMA_PIC_BREADTH_TOO_LARGE, + LUMA_SAMPLE_RATE_TOO_LARGE, + CPB_TOO_LARGE, + COMPRESSION_RATIO_TOO_SMALL, + TOO_MANY_COLUMN_TILE, + ALTREF_DIST_TOO_SMALL, + TOO_MANY_REF_BUFFER, + TARGET_LEVEL_FAIL_IDS } TARGET_LEVEL_FAIL_ID; typedef struct { @@ -541,6 +546,8 @@ typedef struct VP9_COMP { uint8_t *segmentation_map; + uint8_t *skin_map; + // segment threashold for encode breakout int segment_encode_breakout[MAX_SEGMENTS]; @@ -548,7 +555,6 @@ typedef struct VP9_COMP { ActiveMap active_map; fractional_mv_step_fp *find_fractional_mv_step; - vp9_full_search_fn_t full_search_sad; vp9_diamond_search_fn_t diamond_search_sad; vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; uint64_t time_receive_data; @@ -714,6 +720,9 @@ typedef struct VP9_COMP { int compute_source_sad_onepass; LevelConstraint level_constraint; + + uint8_t *count_arf_frame_usage; + uint8_t 
*count_lastgolden_frame_usage; } VP9_COMP; void vp9_initialize_enc(void); @@ -861,13 +870,14 @@ static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { return (!cpi->use_svc || (cpi->use_svc && - cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)); + cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); } #endif +#define MIN_LOOKAHEAD_FOR_ARFS 4 static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { return !(cpi->oxcf.mode == REALTIME && cpi->oxcf.rc_mode == VPX_CBR) && - cpi->oxcf.lag_in_frames > 0 && + cpi->oxcf.lag_in_frames >= MIN_LOOKAHEAD_FOR_ARFS && (cpi->oxcf.enable_auto_arf && (!is_two_pass_svc(cpi) || cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id])); @@ -910,6 +920,22 @@ static INLINE int get_level_index(VP9_LEVEL level) { return -1; } +// Return the log2 value of max column tiles corresponding to the level that +// the picture size fits into. +static INLINE int log_tile_cols_from_picsize_level(uint32_t width, + uint32_t height) { + int i; + const uint32_t pic_size = width * height; + const uint32_t pic_breadth = VPXMAX(width, height); + for (i = LEVEL_1; i < LEVEL_MAX; ++i) { + if (vp9_level_defs[i].max_luma_picture_size >= pic_size && + vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { + return get_msb(vp9_level_defs[i].max_col_tiles); + } + } + return INT_MAX; +} + VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); void vp9_new_framerate(VP9_COMP *cpi, double framerate); diff --git a/libvpx/vp9/encoder/vp9_ethread.c b/libvpx/vp9/encoder/vp9_ethread.c index 51664112a..0bd2e2145 100644 --- a/libvpx/vp9/encoder/vp9_ethread.c +++ b/libvpx/vp9/encoder/vp9_ethread.c @@ -35,7 +35,8 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { td_t->rd_counts.coef_counts[i][j][k][l][m][n]; } -static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { +static int enc_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; VP9_COMP *const cpi = thread_data->cpi; const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -64,6 +65,13 @@ static int get_max_tile_cols(VP9_COMP *cpi) { vp9_get_tile_n_bits(mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); log2_tile_cols = clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); + if (cpi->oxcf.target_level == LEVEL_AUTO) { + const int level_tile_cols = + log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height); + if (log2_tile_cols > level_tile_cols) { + log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols); + } + } return (1 << log2_tile_cols); } @@ -135,7 +143,7 @@ static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2, for (i = 0; i < num_workers; i++) { VPxWorker *const worker = &cpi->workers[i]; - worker->hook = (VPxWorkerHook)hook; + worker->hook = hook; worker->data1 = &cpi->tile_thr_data[i]; worker->data2 = data2; } @@ -203,7 +211,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { } } - launch_enc_workers(cpi, (VPxWorkerHook)enc_worker_hook, NULL, num_workers); + launch_enc_workers(cpi, enc_worker_hook, NULL, num_workers); for (i = 0; i < num_workers; i++) { VPxWorker *const worker = &cpi->workers[i]; @@ -217,6 +225,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { } } +#if !CONFIG_REALTIME_ONLY static void accumulate_fp_tile_stat(TileDataEnc *tile_data, TileDataEnc *tile_data_t) { tile_data->fp_data.intra_factor += 
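/*
 * [Editor's note] Worked example for log_tile_cols_from_picsize_level()
 * above, reading max_col_tiles from the vp9_level_defs rows earlier in
 * this diff: a 1920x1080 stream has pic_size 2073600 and breadth 1920;
 * LEVEL_3_1 (size 983040) is too small, so the first fit is LEVEL_4
 * (size 2228224, breadth 4160), whose max_col_tiles of 4 yields
 * get_msb(4) = 2, i.e. at most 1 << 2 = 4 tile columns. set_tile_limits()
 * and get_max_tile_cols() clamp to this value when target_level is the
 * new LEVEL_AUTO.
 */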
tile_data_t->fp_data.intra_factor; @@ -251,6 +260,7 @@ static void accumulate_fp_tile_stat(TileDataEnc *tile_data, : VPXMIN(tile_data->fp_data.image_data_start_row, tile_data_t->fp_data.image_data_start_row); } +#endif // !CONFIG_REALTIME_ONLY // Allocate memory for row synchronization void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, @@ -379,6 +389,7 @@ void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, return; } +#if !CONFIG_REALTIME_ONLY static int first_pass_worker_hook(EncWorkerData *const thread_data, MultiThreadHandle *multi_thread_ctxt) { VP9_COMP *const cpi = thread_data->cpi; @@ -545,6 +556,7 @@ void vp9_temporal_filter_row_mt(VP9_COMP *cpi) { launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook, multi_thread_ctxt, num_workers); } +#endif // !CONFIG_REALTIME_ONLY static int enc_row_mt_worker_hook(EncWorkerData *const thread_data, MultiThreadHandle *multi_thread_ctxt) { diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c index b6e327548..fb6b132a5 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libvpx/vp9/encoder/vp9_firstpass.c @@ -41,9 +41,9 @@ #define OUTPUT_FPF 0 #define ARF_STATS_OUTPUT 0 +#define COMPLEXITY_STATS_OUTPUT 0 #define FIRST_PASS_Q 10.0 -#define GF_MAX_BOOST 96.0 #define INTRA_MODE_PENALTY 1024 #define MIN_ARF_GF_BOOST 240 #define MIN_DECAY_FACTOR 0.01 @@ -103,7 +103,7 @@ static void output_stats(FIRSTPASS_STATS *stats, fpfile = fopen("firstpass.stt", "a"); fprintf(fpfile, - "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf" + "%12.0lf %12.4lf %12.2lf %12.2lf %12.2lf %12.0lf %12.4lf %12.4lf" "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf" "%12.4lf" @@ -235,16 +235,25 @@ static double calculate_active_area(const VP9_COMP *cpi, return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA); } +// Get the average weighted error for the clip (or corpus) +static double get_distribution_av_err(VP9_COMP *cpi, TWO_PASS *const twopass) { + const double av_weight = + twopass->total_stats.weight / twopass->total_stats.count; + + if (cpi->oxcf.vbr_corpus_complexity) + return av_weight * twopass->mean_mod_score; + else + return (twopass->total_stats.coded_error * av_weight) / + twopass->total_stats.count; +} + +#define ACT_AREA_CORRECTION 0.5 // Calculate a modified Error used in distributing bits between easier and // harder frames. 
-#define ACT_AREA_CORRECTION 0.5 static double calculate_mod_frame_score(const VP9_COMP *cpi, - const TWO_PASS *twopass, const VP9EncoderConfig *oxcf, - const FIRSTPASS_STATS *this_frame) { - const FIRSTPASS_STATS *const stats = &twopass->total_stats; - const double av_weight = stats->weight / stats->count; - const double av_err = (stats->coded_error * av_weight) / stats->count; + const FIRSTPASS_STATS *this_frame, + const double av_err) { double modified_score = av_err * pow(this_frame->coded_error * this_frame->weight / DOUBLE_DIVIDE_CHECK(av_err), @@ -260,13 +269,12 @@ static double calculate_mod_frame_score(const VP9_COMP *cpi, return modified_score; } + static double calculate_norm_frame_score(const VP9_COMP *cpi, const TWO_PASS *twopass, const VP9EncoderConfig *oxcf, - const FIRSTPASS_STATS *this_frame) { - const FIRSTPASS_STATS *const stats = &twopass->total_stats; - const double av_weight = stats->weight / stats->count; - const double av_err = (stats->coded_error * av_weight) / stats->count; + const FIRSTPASS_STATS *this_frame, + const double av_err) { double modified_score = av_err * pow(this_frame->coded_error * this_frame->weight / DOUBLE_DIVIDE_CHECK(av_err), @@ -723,8 +731,9 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, // Exclude any image dead zone if (fp_acc_data->image_data_start_row > 0) { fp_acc_data->intra_skip_count = - VPXMAX(0, fp_acc_data->intra_skip_count - - (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); + VPXMAX(0, + fp_acc_data->intra_skip_count - + (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); } fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs; @@ -1583,6 +1592,7 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, const RATE_CONTROL *const rc = &cpi->rc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; TWO_PASS *const twopass = &cpi->twopass; + double last_group_rate_err; // Clamp the target rate to VBR min / max limits. const int target_rate = @@ -1591,6 +1601,18 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, noise_factor = fclamp(noise_factor, NOISE_FACTOR_MIN, NOISE_FACTOR_MAX); inactive_zone = fclamp(inactive_zone, 0.0, 1.0); +// TODO(jimbankoski): remove #if here or below when this has been +// well tested. +#if CONFIG_ALWAYS_ADJUST_BPM + // based on recent history adjust expectations of bits per macroblock. + last_group_rate_err = + (double)twopass->rolling_arf_group_actual_bits / + DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits); + last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err)); + twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; + twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor)); +#endif + if (target_rate <= 0) { return rc->worst_quality; // Highest value allowed } else { @@ -1601,11 +1623,13 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, const int active_mbs = (int)VPXMAX(1, (double)num_mbs * active_pct); const double av_err_per_mb = section_err / active_pct; const double speed_term = 1.0 + 0.04 * oxcf->speed; - double last_group_rate_err; const int target_norm_bits_per_mb = (int)(((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs); int q; +// TODO(jimbankoski): remove #if here or above when this has been +// well tested. +#if !CONFIG_ALWAYS_ADJUST_BPM // based on recent history adjust expectations of bits per macroblock.
last_group_rate_err = (double)twopass->rolling_arf_group_actual_bits / @@ -1613,6 +1637,7 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err)); twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor)); +#endif // Try and pick a max Q that will be high enough to encode the // content at the given rate. @@ -1666,7 +1691,7 @@ void calculate_coded_size(VP9_COMP *cpi, int *scaled_frame_width, void vp9_init_second_pass(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; - const VP9EncoderConfig *const oxcf = &cpi->oxcf; + VP9EncoderConfig *const oxcf = &cpi->oxcf; const int is_two_pass_svc = (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); RATE_CONTROL *const rc = &cpi->rc; @@ -1686,6 +1711,63 @@ void vp9_init_second_pass(VP9_COMP *cpi) { *stats = *twopass->stats_in_end; twopass->total_left_stats = *stats; + // Scan the first pass file and calculate a modified score for each + // frame that is used to distribute bits. The modified score is assumed + // to provide a linear basis for bit allocation. I.e., a frame A with a score + // that is double that of frame B will be allocated 2x as many bits. + { + double modified_score_total = 0.0; + const FIRSTPASS_STATS *s = twopass->stats_in; + double av_err; + + if (oxcf->vbr_corpus_complexity) { + twopass->mean_mod_score = (double)oxcf->vbr_corpus_complexity / 10.0; + av_err = get_distribution_av_err(cpi, twopass); + } else { + av_err = get_distribution_av_err(cpi, twopass); + // The first scan is unclamped and gives a raw average. + while (s < twopass->stats_in_end) { + modified_score_total += calculate_mod_frame_score(cpi, oxcf, s, av_err); + ++s; + } + + // The average error from this first scan is used to define the midpoint + // error for the rate distribution function. + twopass->mean_mod_score = + modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count); + } + + // Second scan using clamps based on the previous cycle average. + // This may modify the total and average somewhat but we don't bother with + // further iterations. + modified_score_total = 0.0; + s = twopass->stats_in; + while (s < twopass->stats_in_end) { + modified_score_total += + calculate_norm_frame_score(cpi, twopass, oxcf, s, av_err); + ++s; + } + twopass->normalized_score_left = modified_score_total; + + // If using Corpus wide VBR mode then update the clip target bandwidth to + // reflect how the clip compares to the rest of the corpus. + if (oxcf->vbr_corpus_complexity) { + oxcf->target_bandwidth = + (int64_t)((double)oxcf->target_bandwidth * + (twopass->normalized_score_left / stats->count)); + } + +#if COMPLEXITY_STATS_OUTPUT + { + FILE *compstats; + compstats = fopen("complexity_stats.stt", "a"); + fprintf(compstats, "%10.3lf\n", + twopass->normalized_score_left / stats->count); + fclose(compstats); + } +#endif + } + frame_rate = 10000000.0 * stats->count / stats->duration; // Each frame can have a different duration, as the frame rate in the source // isn't guaranteed to be constant. The frame rate prior to the first frame @@ -1708,37 +1790,6 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // This variable monitors how far behind the second ref update is lagging. twopass->sr_update_lag = 1; - // Scan the first pass file and calculate a modified score for each - // frame that is used to distribute bits. The modified score is assumed - // to provide a linear basis for bit allocation.
I.e a frame A with a score - that is double that of frame B will be allocated 2x as many bits. - { - const FIRSTPASS_STATS *s = twopass->stats_in; - double modified_score_total = 0.0; - - // The first scan is unclamped and gives a raw average. - while (s < twopass->stats_in_end) { - modified_score_total += calculate_mod_frame_score(cpi, twopass, oxcf, s); - ++s; - } - - // The average error from this first scan is used to define the midpoint - // error for the rate distribution function. - twopass->mean_mod_score = - modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count); - - // Second scan using clamps based on the previous cycle average. - // This may modify the total and average somewhat but we dont bother with - // further itterations. - s = twopass->stats_in; - modified_score_total = 0.0; - while (s < twopass->stats_in_end) { - modified_score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s); - ++s; - } - twopass->normalized_score_left = modified_score_total; - } - // Reset the vbr bits off target counters rc->vbr_bits_off_target = 0; rc->vbr_bits_off_target_fast = 0; @@ -1897,9 +1948,9 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, } #define BASELINE_ERR_PER_MB 12500.0 +#define GF_MAX_BOOST 96.0 static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, - double *sr_accumulator, - double this_frame_mv_in_out, double max_boost) { + double this_frame_mv_in_out) { double frame_boost; const double lq = vp9_convert_qindex_to_q( cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); @@ -1908,13 +1959,7 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, // Underlying boost factor is based on inter error ratio. frame_boost = (BASELINE_ERR_PER_MB * active_area) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); - - // Update the accumulator for second ref error difference. - // This is intended to give an indication of how much the coded error is - // increasing over time. - *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error); - *sr_accumulator = VPXMAX(0.0, *sr_accumulator); + DOUBLE_DIVIDE_CHECK(this_frame->coded_error); // Small adjustment for cases where there is a zoom out if (this_frame_mv_in_out > 0.0) @@ -1923,7 +1968,7 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, // Q correction and scaling frame_boost = frame_boost * boost_q_correction; - return VPXMIN(frame_boost, max_boost * boost_q_correction); + return VPXMIN(frame_boost, GF_MAX_BOOST * boost_q_correction); } #define KF_BASELINE_ERR_PER_MB 12500.0 @@ -1958,8 +2003,7 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, return VPXMIN(frame_boost, max_boost * boost_q_correction); } -static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, - int *f_boost, int *b_boost) { +static int calc_arf_boost(VP9_COMP *cpi, int f_frames, int b_frames) { TWO_PASS *const twopass = &cpi->twopass; int i; double boost_score = 0.0; @@ -1968,13 +2012,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, double this_frame_mv_in_out = 0.0; double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; - double sr_accumulator = 0.0; int arf_boost; int flash_detected = 0; // Search forward from the proposed arf/next gf position.
for (i = 0; i < f_frames; ++i) { - const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i); if (this_frame == NULL) break; // Update the motion related elements to the boost calculation. @@ -1984,8 +2027,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. - flash_detected = detect_flash(twopass, i + offset) || - detect_flash(twopass, i + offset + 1); + flash_detected = detect_flash(twopass, i) || detect_flash(twopass, i + 1); // Accumulate the effect of prediction quality decay. if (!flash_detected) { @@ -1994,14 +2036,11 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, ? MIN_DECAY_FACTOR : decay_accumulator; } - - sr_accumulator = 0.0; boost_score += decay_accumulator * - calc_frame_boost(cpi, this_frame, &sr_accumulator, - this_frame_mv_in_out, GF_MAX_BOOST); + calc_frame_boost(cpi, this_frame, this_frame_mv_in_out); } - *f_boost = (int)boost_score; + arf_boost = (int)boost_score; // Reset for backward looking loop. boost_score = 0.0; @@ -2010,11 +2049,10 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, this_frame_mv_in_out = 0.0; mv_in_out_accumulator = 0.0; abs_mv_in_out_accumulator = 0.0; - sr_accumulator = 0.0; // Search backward towards last gf position. for (i = -1; i >= -b_frames; --i) { - const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i); if (this_frame == NULL) break; // Update the motion related elements to the boost calculation. @@ -2024,8 +2062,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. - flash_detected = detect_flash(twopass, i + offset) || - detect_flash(twopass, i + offset + 1); + flash_detected = detect_flash(twopass, i) || detect_flash(twopass, i + 1); // Cumulative effect of prediction quality decay. if (!flash_detected) { @@ -2034,17 +2071,13 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, ? MIN_DECAY_FACTOR : decay_accumulator; } - - sr_accumulator = 0.0; boost_score += decay_accumulator * - calc_frame_boost(cpi, this_frame, &sr_accumulator, - this_frame_mv_in_out, GF_MAX_BOOST); + calc_frame_boost(cpi, this_frame, this_frame_mv_in_out); } - *b_boost = (int)boost_score; + arf_boost += (int)boost_score; - arf_boost = (*f_boost + *b_boost); - if (arf_boost < ((b_frames + f_frames) * 20)) - arf_boost = ((b_frames + f_frames) * 20); + if (arf_boost < ((b_frames + f_frames) * 40)) + arf_boost = ((b_frames + f_frames) * 40); arf_boost = VPXMAX(arf_boost, MIN_ARF_GF_BOOST); return arf_boost; @@ -2105,7 +2138,7 @@ static int calculate_boost_bits(int frame_count, int boost, int allocation_chunks; // return 0 for invalid inputs (could arise e.g.
through rounding errors) - if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0; + if (!boost || (total_group_bits <= 0) || (frame_count < 0)) return 0; allocation_chunks = (frame_count * 100) + boost; @@ -2133,8 +2166,33 @@ static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) { arf_buffer_indices[1] = ARF_SLOT2; } +// Used in corpus vbr: Calculates the total normalized group complexity score +// for a given number of frames starting at the current position in the stats +// file. +static double calculate_group_score(VP9_COMP *cpi, double av_score, + int frame_count) { + VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + const FIRSTPASS_STATS *s = twopass->stats_in; + double score_total = 0.0; + int i = 0; + + // We don't ever want to return a 0 score here. + if (frame_count == 0) return 1.0; + + while ((i < frame_count) && (s < twopass->stats_in_end)) { + score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s, av_score); + ++s; + ++i; + } + assert(i == frame_count); + + return score_total; +} + static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, int gf_arf_bits) { + VP9EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->twopass; GF_GROUP *const gf_group = &twopass->gf_group; @@ -2143,7 +2201,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, int frame_index = 1; int target_frame_size; int key_frame; - const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf); + const int max_bits = frame_max_bits(&cpi->rc, oxcf); int64_t total_group_bits = gf_group_bits; int mid_boost_bits = 0; int mid_frame_idx; @@ -2153,8 +2211,10 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1; int normal_frames; int normal_frame_bits; - int last_frame_bits; - int last_frame_reduction; + int last_frame_reduction = 0; + double av_score = 1.0; + double tot_norm_frame_score = 1.0; + double this_frame_score = 1.0; // Only encode alt reference frame in temporal base layer. if (has_temporal_layers) alt_frame_index = cpi->svc.number_temporal_layers; @@ -2226,17 +2286,14 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending); - - // The last frame in the group is used less as a predictor so reduce - // its allocation a little. - if (normal_frames > 1) { + if (normal_frames > 1) normal_frame_bits = (int)(total_group_bits / normal_frames); - last_frame_reduction = normal_frame_bits / 16; - last_frame_bits = normal_frame_bits - last_frame_reduction; - } else { + else normal_frame_bits = (int)total_group_bits; - last_frame_bits = normal_frame_bits; - last_frame_reduction = 0; + + if (oxcf->vbr_corpus_complexity) { + av_score = get_distribution_av_err(cpi, twopass); + tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames); } // Allocate bits to the other frames in the group. @@ -2248,11 +2305,18 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, ++frame_index; } - target_frame_size = (i == (normal_frames - 1)) - ? last_frame_bits - : (i == mid_frame_idx) - ?
normal_frame_bits + last_frame_reduction - : normal_frame_bits; + if (oxcf->vbr_corpus_complexity) { + this_frame_score = calculate_norm_frame_score(cpi, twopass, oxcf, + &frame_stats, av_score); + normal_frame_bits = (int)((double)total_group_bits * + (this_frame_score / tot_norm_frame_score)); + } + + target_frame_size = normal_frame_bits; + if ((i == (normal_frames - 1)) && (i >= 1)) { + last_frame_reduction = normal_frame_bits / 16; + target_frame_size -= last_frame_reduction; + } if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { mid_boost_bits += (target_frame_size >> 4); @@ -2273,6 +2337,9 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, ++frame_index; } + // Add in some extra bits for the middle frame in the group. + gf_group->bit_allocation[mid_frame_idx] += last_frame_reduction; + // Note: // We need to configure the frame at the end of the sequence + 1 that will be // the start frame for the next group. Otherwise prior to the call to @@ -2316,6 +2383,8 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise, // Analyse and define a gf/arf group. #define ARF_DECAY_BREAKOUT 0.10 +#define ARF_ABS_ZOOM_THRESH 4.0 + static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2325,8 +2394,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { const FIRSTPASS_STATS *const start_pos = twopass->stats_in; int i; - double boost_score = 0.0; - double old_boost_score = 0.0; double gf_group_err = 0.0; double gf_group_raw_error = 0.0; double gf_group_noise = 0.0; @@ -2338,7 +2405,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double mod_frame_err = 0.0; double mv_ratio_accumulator = 0.0; - double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; double loop_decay_rate = 1.00; double last_loop_decay_rate = 1.00; @@ -2347,13 +2413,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; double mv_ratio_accumulator_thresh; - double mv_in_out_thresh; double abs_mv_in_out_thresh; double sr_accumulator = 0.0; + const double av_err = get_distribution_av_err(cpi, twopass); unsigned int allow_alt_ref = is_altref_enabled(cpi); - int f_boost = 0; - int b_boost = 0; int flash_detected; int active_max_gf_interval; int active_min_gf_interval; @@ -2372,7 +2436,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { vp9_zero(next_frame); // Load stats for the current frame. - mod_frame_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); + mod_frame_err = + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); // Note the error of the frame at the start of the group. This will be // the GF frame error if we code a normal gf. @@ -2393,8 +2458,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Motion breakout threshold for loop below depends on image size. mv_ratio_accumulator_thresh = (cpi->initial_height + cpi->initial_width) / 4.0; - mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 300.0; - abs_mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 200.0; + abs_mv_in_out_thresh = ARF_ABS_ZOOM_THRESH; // Set a maximum and minimum interval for the GF group. // If the image appears almost completely static we can extend beyond this. 
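The corpus VBR path above splits the GF group budget in proportion to each frame's normalized complexity score. A minimal standalone sketch of that proportional split, assuming hypothetical inputs: scores[] stands in for per-frame calculate_norm_frame_score() output and total_group_bits for the group budget; this is an illustration of the idea, not the libvpx implementation.

  #include <assert.h>
  #include <stdint.h>

  static int corpus_vbr_frame_bits(const double *scores, int frame_count,
                                   int frame_idx, int64_t total_group_bits) {
    double score_total = 0.0;
    int i;
    assert(frame_idx >= 0 && frame_idx < frame_count);
    for (i = 0; i < frame_count; ++i) score_total += scores[i];
    // Guard against a degenerate zero total (cf. DOUBLE_DIVIDE_CHECK).
    if (score_total <= 0.0) return 0;
    // Each frame gets a share of the budget proportional to its score.
    return (int)((double)total_group_bits * (scores[frame_idx] / score_total));
  }

A frame whose score is twice the group average thus receives twice the average per-frame allocation, matching the linear-basis assumption stated in vp9_init_second_pass() above.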
@@ -2438,7 +2502,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ++i; // Accumulate error score of frames in this gf group. - mod_frame_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); + mod_frame_err = + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); gf_group_err += mod_frame_err; gf_group_raw_error += this_frame->coded_error; gf_group_noise += this_frame->frame_noise_energy; @@ -2463,8 +2528,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { last_loop_decay_rate = loop_decay_rate; loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); - decay_accumulator = decay_accumulator * loop_decay_rate; - // Monitor for static sections. zero_motion_accumulator = VPXMIN( zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); @@ -2476,13 +2539,16 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { allow_alt_ref = 0; break; } - } - // Calculate a boost number for this frame. - sr_accumulator = 0.0; - boost_score += decay_accumulator * - calc_frame_boost(cpi, &next_frame, &sr_accumulator, - this_frame_mv_in_out, GF_MAX_BOOST); + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + if (i == 1) { + sr_accumulator += next_frame.coded_error; + } else { + sr_accumulator += (next_frame.sr_coded_error - next_frame.coded_error); + } + } // Break out conditions. if ( @@ -2496,14 +2562,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { (!flash_detected) && ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || (abs_mv_in_out_accumulator > abs_mv_in_out_thresh) || - (mv_in_out_accumulator < -mv_in_out_thresh) || - (decay_accumulator < ARF_DECAY_BREAKOUT)))) { - boost_score = old_boost_score; + (sr_accumulator > next_frame.intra_error)))) { break; } *this_frame = next_frame; - old_boost_score = boost_score; } // Was the group length constrained by the requirement for a new KF? @@ -2512,9 +2575,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Should we use the alternate reference frame. if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { + const int forward_frames = (rc->frames_to_key - i >= i - 1) + ? i - 1 + : VPXMAX(0, rc->frames_to_key - i); + // Calculate the boost for alt ref. - rc->gfu_boost = - calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); + rc->gfu_boost = calc_arf_boost(cpi, forward_frames, (i - 1)); rc->source_alt_ref_pending = 1; // Test to see if multi arf is appropriate. @@ -2524,7 +2590,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ? 
1 : 0; } else { - rc->gfu_boost = VPXMAX((int)boost_score, MIN_ARF_GF_BOOST); + rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1)); rc->source_alt_ref_pending = 0; } @@ -2548,7 +2614,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) { if (EOF == input_stats(twopass, this_frame)) break; gf_group_err += - calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); gf_group_raw_error += this_frame->coded_error; gf_group_noise += this_frame->frame_noise_energy; gf_group_skip_pct += this_frame->intra_skip_pct; @@ -2587,6 +2653,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { group_av_noise, vbr_group_bits_per_frame); twopass->active_worst_quality = (tmp_q + (twopass->active_worst_quality * 3)) >> 2; + +#if CONFIG_ALWAYS_ADJUST_BPM + // Reset rolling actual and target bits counters for ARF groups. + twopass->rolling_arf_group_target_bits = 0; + twopass->rolling_arf_group_actual_bits = 0; +#endif } // Context Adjustment of ARNR filter strength @@ -2621,10 +2693,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Default to starting GF groups at normal frame size. cpi->rc.next_frame_size_selector = UNSCALED; } - +#if !CONFIG_ALWAYS_ADJUST_BPM // Reset rolling actual and target bits counters for ARF groups. twopass->rolling_arf_group_target_bits = 0; twopass->rolling_arf_group_actual_bits = 0; +#endif } // Threshold for use of the lagging second reference frame. High second ref @@ -2769,7 +2842,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double kf_group_err = 0.0; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; double sr_accumulator = 0.0; - + const double av_err = get_distribution_av_err(cpi, twopass); vp9_zero(next_frame); cpi->common.frame_type = KEY_FRAME; @@ -2793,7 +2866,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_group_bits = 0; // Total bits available to kf group twopass->kf_group_error_left = 0.0; // Group modified error score. - kf_mod_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); + kf_mod_err = + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); // Initialize the decay rates for the recent frames to check for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; @@ -2803,7 +2877,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { while (twopass->stats_in < twopass->stats_in_end && rc->frames_to_key < cpi->oxcf.key_freq) { // Accumulate kf group error. - kf_group_err += calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); + kf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); // Load the next frame's stats. last_frame = *this_frame; @@ -2864,7 +2939,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Rescan to get the correct error data for the forced kf group. 
for (i = 0; i < rc->frames_to_key; ++i) { kf_group_err += - calculate_norm_frame_score(cpi, twopass, oxcf, &tmp_frame); + calculate_norm_frame_score(cpi, twopass, oxcf, &tmp_frame, av_err); input_stats(twopass, &tmp_frame); } rc->next_key_frame_forced = 1; @@ -2882,7 +2957,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) { if (EOF == input_stats(twopass, this_frame)) break; kf_group_err += - calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); } rc->frames_to_key = new_frame_to_key; } @@ -2890,7 +2965,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_in_end) { // Accumulate kf group error. - kf_group_err += calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); + kf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); } // Calculate the number of bits that should be assigned to the kf group. diff --git a/libvpx/vp9/encoder/vp9_frame_scale.c b/libvpx/vp9/encoder/vp9_frame_scale.c index e58628388..a410d0407 100644 --- a/libvpx/vp9/encoder/vp9_frame_scale.c +++ b/libvpx/vp9/encoder/vp9_frame_scale.c @@ -20,8 +20,6 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, INTERP_FILTER filter_type, int phase_scaler) { const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; - const int dst_w = dst->y_crop_width; - const int dst_h = dst->y_crop_height; const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, src->v_buffer }; const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; @@ -30,23 +28,86 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; int x, y, i; - for (i = 0; i < MAX_MB_PLANE; ++i) { - const int factor = (i == 0 || i == 3 ? 1 : 2); - const int src_stride = src_strides[i]; - const int dst_stride = dst_strides[i]; - for (y = 0; y < dst_h; y += 16) { - const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler; - for (x = 0; x < dst_w; x += 16) { - const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler; - const uint8_t *src_ptr = srcs[i] + - (y / factor) * src_h / dst_h * src_stride + - (x / factor) * src_w / dst_w; - uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); +#if HAVE_SSSE3 || HAVE_NEON + // TODO(linfengz): The 4:3 specialized C code is disabled by default since + // it's much slower than the general version which calls vpx_scaled_2d() even + // if vpx_scaled_2d() is not optimized. It will only be enabled as a reference + // for the platforms which have faster optimization. + if (4 * dst->y_crop_width == 3 * src_w && + 4 * dst->y_crop_height == 3 * src_h) { + // Specialize 4 to 3 scaling. + // Example pixel locations. + // (O: Original pixel. S: Scaled pixel. X: Overlapped pixel.) 
+ // phase_scaler = 0 | phase_scaler = 8 + // | + // X O S O S O X | O O O O O + // | + // | + // | S S S + // | + // | + // O O O O O | O O O O O + // | + // S S S S | + // | + // | + // | S S S + // O O O O O | O O O O O + // | + // | + // | + // S S S S | + // | + // O O O O O | O O O O O + // | S S S + // | + // | + // | + // | + // X O S O S O X | O O O O O - vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, - 16 / factor); + const int dst_ws[3] = { dst->y_crop_width, dst->uv_crop_width, + dst->uv_crop_width }; + const int dst_hs[3] = { dst->y_crop_height, dst->uv_crop_height, + dst->uv_crop_height }; + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int dst_w = dst_ws[i]; + const int dst_h = dst_hs[i]; + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + for (y = 0; y < dst_h; y += 3) { + for (x = 0; x < dst_w; x += 3) { + const uint8_t *src_ptr = srcs[i] + 4 * y / 3 * src_stride + 4 * x / 3; + uint8_t *dst_ptr = dsts[i] + y * dst_stride + x; + + // Must call c function because its optimization doesn't support 3x3. + vpx_scaled_2d_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + phase_scaler, 64 / 3, phase_scaler, 64 / 3, 3, 3); + } + } + } + } else +#endif + { + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int factor = (i == 0 || i == 3 ? 1 : 2); + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + for (y = 0; y < dst_h; y += 16) { + const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler; + for (x = 0; x < dst_w; x += 16) { + const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler; + const uint8_t *src_ptr = srcs[i] + + (y / factor) * src_h / dst_h * src_stride + + (x / factor) * src_w / dst_w; + uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); + + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor); + } } } } diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c index 24e23af3b..44f01be25 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.c +++ b/libvpx/vp9/encoder/vp9_mcomp.c @@ -361,7 +361,7 @@ static unsigned int setup_center_error( #endif // CONFIG_VP9_HIGHBITDEPTH } -static INLINE int divide_and_round(const int n, const int d) { +static INLINE int64_t divide_and_round(const int64_t n, const int64_t d) { return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d); } @@ -379,10 +379,13 @@ static INLINE int is_cost_list_wellbehaved(int *cost_list) { // y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0). // The code below is an integerized version of that. 
static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) { - *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)), - (cost_list[1] - 2 * cost_list[0] + cost_list[3])); - *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)), - (cost_list[4] - 2 * cost_list[0] + cost_list[2])); + const int64_t x0 = (int64_t)cost_list[1] - cost_list[3]; + const int64_t y0 = cost_list[1] - 2 * (int64_t)cost_list[0] + cost_list[3]; + const int64_t x1 = (int64_t)cost_list[4] - cost_list[2]; + const int64_t y1 = cost_list[4] - 2 * (int64_t)cost_list[0] + cost_list[2]; + const int b = 1 << (bits - 1); + *ic = (int)divide_and_round(x0 * b, y0); + *ir = (int)divide_and_round(x1 * b, y1); } uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, @@ -441,7 +444,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) { int ir, ic; - unsigned int minpt; + unsigned int minpt = INT_MAX; get_cost_surf_min(cost_list, &ir, &ic, 2); if (ir != 0 || ic != 0) { CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic); @@ -2039,197 +2042,6 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, return bestsme; } -int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r, c; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - for (c = col_min; c < col_max; ++c) { - const MV mv = { r, c }; - const int sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), - in_what->stride) + - mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - } - return best_sad; -} - -int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - int c = col_min; - const uint8_t *check_here = 
&in_what->buf[r * in_what->stride + c]; - - if (fn_ptr->sdx3f != NULL) { - while ((c + 2) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[3]); - - fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 3; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - while (c < col_max) { - unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - - return best_sad; -} - -int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - int c = col_min; - const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - - if (fn_ptr->sdx8f != NULL) { - while ((c + 7) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[8]); - - fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 8; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - if (fn_ptr->sdx3f != NULL) { - while ((c + 2) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[3]); - - fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 3; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - while (c < col_max) { - unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - - return best_sad; -} - int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, const vp9_variance_fn_ptr_t *fn_ptr, diff --git a/libvpx/vp9/encoder/vp9_noise_estimate.c b/libvpx/vp9/encoder/vp9_noise_estimate.c index e2239b44b..276a0c785 100644 --- a/libvpx/vp9/encoder/vp9_noise_estimate.c +++ b/libvpx/vp9/encoder/vp9_noise_estimate.c @@ -21,6 +21,15 
@@ #include "vp9/encoder/vp9_noise_estimate.h" #include "vp9/encoder/vp9_encoder.h" +#if CONFIG_VP9_TEMPORAL_DENOISING +// For SVC: only do noise estimation on top spatial layer. +static INLINE int noise_est_svc(const struct VP9_COMP *const cpi) { + return (!cpi->use_svc || + (cpi->use_svc && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)); +} +#endif + void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->enabled = 0; ne->level = kLowLow; @@ -34,7 +43,7 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { } else if (width * height >= 1280 * 720) { ne->thresh = 140; } else if (width * height >= 640 * 360) { - ne->thresh = 100; + ne->thresh = 115; } ne->num_frames_estimate = 15; } @@ -45,7 +54,7 @@ static int enable_noise_estimation(VP9_COMP *const cpi) { #endif // Enable noise estimation if denoising is on. #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && cpi->common.width >= 320 && cpi->common.height >= 180) return 1; #endif @@ -56,8 +65,8 @@ static int enable_noise_estimation(VP9_COMP *const cpi) { if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 && cpi->resize_state == ORIG && cpi->resize_pending == 0 && !cpi->use_svc && - cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->common.width >= 640 && - cpi->common.height >= 360) + cpi->oxcf.content != VP9E_CONTENT_SCREEN && + cpi->common.width * cpi->common.height >= 640 * 360) return 1; else return 0; @@ -111,7 +120,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { // Estimate is between current source and last source. YV12_BUFFER_CONFIG *last_source = cpi->Last_Source; #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) { + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) { last_source = &cpi->denoiser.last_source; // Tune these thresholds for different resolutions when denoising is // enabled. @@ -131,7 +140,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { (cpi->svc.number_spatial_layers == 1 && (ne->last_w != cm->width || ne->last_h != cm->height))) { #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) copy_frame(&cpi->denoiser.last_source, cpi->Source); #endif if (last_source != NULL) { @@ -146,7 +155,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { ne->count = 0; ne->num_frames_estimate = 10; #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && cpi->svc.current_superframe > 1) { vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); copy_frame(&cpi->denoiser.last_source, cpi->Source); @@ -190,44 +199,42 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { int bl_index1 = bl_index + 1; int bl_index2 = bl_index + cm->mi_cols; int bl_index3 = bl_index2 + 1; - // Only consider blocks that are likely steady background. i.e, have - // been encoded as zero/low motion x (= thresh_consec_zeromv) frames - // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all - // 4 sub-blocks for 16x16 block. Also, avoid skin blocks. 
int consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], VPXMIN(cpi->consec_zero_mv[bl_index1], VPXMIN(cpi->consec_zero_mv[bl_index2], cpi->consec_zero_mv[bl_index3]))); - int is_skin = 0; - if (cpi->use_skin_detection) { - is_skin = - vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, - src_uvstride, bsize, consec_zeromv, 0); - } - if (frame_low_motion && - cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index1] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index2] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index3] > thresh_consec_zeromv && - !is_skin) { - // Compute variance. - unsigned int sse; - unsigned int variance = cpi->fn_ptr[bsize].vf( - src_y, src_ystride, last_src_y, last_src_ystride, &sse); - // Only consider this block as valid for noise measurement if the - // average term (sse - variance = N * avg^{2}, N = 16X16) of the - // temporal residual is small (avoid effects from lighting change). - if ((sse - variance) < thresh_sum_diff) { - unsigned int sse2; - const unsigned int spatial_variance = cpi->fn_ptr[bsize].vf( - src_y, src_ystride, const_source, 0, &sse2); - // Avoid blocks with high brightness and high spatial variance. - if ((sse2 - spatial_variance) < thresh_sum_spatial && - spatial_variance < thresh_spatial_var) { - avg_est += low_res ? variance >> 4 - : variance / ((spatial_variance >> 9) + 1); - num_samples++; + // Only consider blocks that are likely steady background. i.e., have + // been encoded as zero/low motion x (= thresh_consec_zeromv) frames + // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all + // 4 sub-blocks for 16x16 block. Also, avoid skin blocks. + if (frame_low_motion && consec_zeromv > thresh_consec_zeromv) { + int is_skin = 0; + if (cpi->use_skin_detection) { + is_skin = + vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, + src_uvstride, bsize, consec_zeromv, 0); + } + if (!is_skin) { + unsigned int sse; + // Compute variance. + unsigned int variance = cpi->fn_ptr[bsize].vf( + src_y, src_ystride, last_src_y, last_src_ystride, &sse); + // Only consider this block as valid for noise measurement if the + // average term (sse - variance = N * avg^{2}, N = 16X16) of the + // temporal residual is small (avoid effects from lighting + // change). + if ((sse - variance) < thresh_sum_diff) { + unsigned int sse2; + const unsigned int spatial_variance = cpi->fn_ptr[bsize].vf( + src_y, src_ystride, const_source, 0, &sse2); + // Avoid blocks with high brightness and high spatial variance. + if ((sse2 - spatial_variance) < thresh_sum_spatial && + spatial_variance < thresh_spatial_var) { + avg_est += low_res ?
variance >> 4 + : variance / ((spatial_variance >> 9) + 1); + num_samples++; + } } } } @@ -259,14 +266,14 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { ne->count = 0; ne->level = vp9_noise_estimate_extract_level(ne); #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); #endif } } } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) copy_frame(&cpi->denoiser.last_source, cpi->Source); #endif } diff --git a/libvpx/vp9/encoder/vp9_pickmode.c b/libvpx/vp9/encoder/vp9_pickmode.c index b05f4184b..f2f323a28 100644 --- a/libvpx/vp9/encoder/vp9_pickmode.c +++ b/libvpx/vp9/encoder/vp9_pickmode.c @@ -158,6 +158,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const MvLimits tmp_mv_limits = x->mv_limits; int rv = 0; int cost_list[5]; + int search_subpel = 1; const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); if (scaled_ref_frame) { @@ -192,9 +193,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, else center_mv = tmp_mv->as_mv; - vp9_full_pixel_search( - cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, - cond_cost_list(cpi, cost_list), ¢er_mv, &tmp_mv->as_mv, INT_MAX, 0); + if (x->sb_use_mv_part) { + tmp_mv->as_mv.row = x->sb_mvrow_part >> 3; + tmp_mv->as_mv.col = x->sb_mvcol_part >> 3; + } else { + vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, + cond_cost_list(cpi, cost_list), ¢er_mv, &tmp_mv->as_mv, INT_MAX, 0); + } x->mv_limits = tmp_mv_limits; @@ -210,8 +216,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, rv = !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > best_rd_sofar); - if (rv) { - const int subpel_force_stop = cpi->sf.mv.subpel_force_stop; + // For SVC on non-reference frame, avoid subpel for (0, 0) motion. + if (cpi->use_svc && cpi->svc.non_reference_frame) { + if (mvp_full.row == 0 && mvp_full.col == 0) search_subpel = 0; + } + + if (rv && search_subpel) { + int subpel_force_stop = cpi->sf.mv.subpel_force_stop; + if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = 2; cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop, @@ -318,7 +330,8 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, unsigned int *var_y, unsigned int *sse_y, - int mi_row, int mi_col, int *early_term) { + int mi_row, int mi_col, int *early_term, + int *flag_preduv_computed) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. 
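The comment closing the hunk above is worth unpacking: the transform scales coefficients by a factor of 8, so a quantizer step taken from the dequant tables is 8x the effective step and must be divided by 8 before it is handed to the rate/distortion model. A minimal sketch of that conversion, where dc_dequant is a hypothetical stand-in for a table value:

  static int effective_quant_step(int dc_dequant) {
    // Undo the 8x transform scaling before rate/distortion modeling.
    return dc_dequant >> 3;
  }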
@@ -475,6 +488,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, int j = i - 1; vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + flag_preduv_computed[i - 1] = 1; var_uv[j] = cpi->fn_ptr[uv_bsize].vf( p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); @@ -664,7 +678,9 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, #endif if (cpi->sf.use_simple_block_yrd && cpi->common.frame_type != KEY_FRAME && - bsize < BLOCK_32X32) { + (bsize < BLOCK_32X32 || + (cpi->use_svc && + (bsize < BLOCK_32X32 || cpi->svc.temporal_layer_id > 0)))) { unsigned int var_y, sse_y; (void)tx_size; if (!rd_computed) @@ -711,7 +727,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); @@ -846,13 +862,11 @@ static void free_pred_buffer(PRED_BUFFER *p) { if (p != NULL) p->in_use = 0; } -static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int mi_row, int mi_col, - MV_REFERENCE_FRAME ref_frame, - PREDICTION_MODE this_mode, unsigned int var_y, - unsigned int sse_y, - struct buf_2d yv12_mb[][MAX_MB_PLANE], - int *rate, int64_t *dist) { +static void encode_breakout_test( + VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + MV_REFERENCE_FRAME ref_frame, PREDICTION_MODE this_mode, unsigned int var_y, + unsigned int sse_y, struct buf_2d yv12_mb[][MAX_MB_PLANE], int *rate, + int64_t *dist, int *flag_preduv_computed) { MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); @@ -862,6 +876,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Skipping threshold for dc. unsigned int thresh_dc; int motion_low = 1; + if (cpi->use_svc && ref_frame == GOLDEN_FRAME) return; if (mi->mv[0].as_mv.row > 64 || mi->mv[0].as_mv.row < -64 || mi->mv[0].as_mv.col > 64 || mi->mv[0].as_mv.col < -64) motion_low = 0; @@ -912,9 +927,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, thresh_dc_uv = 0; } - // Skip UV prediction unless breakout is zero (lossless) to save - // computation with low impact on the result - if (x->encode_breakout == 0) { + if (!flag_preduv_computed[0] || !flag_preduv_computed[1]) { xd->plane[1].pre[0] = yv12_mb[ref_frame][1]; xd->plane[2].pre[0] = yv12_mb[ref_frame][2]; vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize); @@ -1163,33 +1176,22 @@ static const REF_MODE ref_mode_set[RT_INTER_MODES] = { { ALTREF_FRAME, ZEROMV }, { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV }, { ALTREF_FRAME, NEWMV } }; -static const REF_MODE ref_mode_set_svc[RT_INTER_MODES] = { + +#define RT_INTER_MODES_SVC 8 +static const REF_MODE ref_mode_set_svc[RT_INTER_MODES_SVC] = { { LAST_FRAME, ZEROMV }, { LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV }, { GOLDEN_FRAME, ZEROMV }, { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV }, { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEWMV } }; -static int set_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize) { - const VP9_COMMON *const cm = &cpi->common; - // Reduce the intra cost penalty for small blocks (<=16x16). - int reduction_fac = - (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 
4 : 2) : 0; - if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh) - // Don't reduce intra cost penalty if estimated noise level is high. - reduction_fac = 0; - return vp9_get_intra_cost_penalty(cm->base_qindex, cm->y_dc_delta_q, - cm->bit_depth) >> - reduction_fac; -} - static INLINE void find_predictors( VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask, const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col, struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize, - int force_skip_low_temp_var) { + int force_skip_low_temp_var, int comp_pred_allowed) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); @@ -1203,7 +1205,7 @@ static INLINE void find_predictors( int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); - if (cm->use_prev_frame_mvs) { + if (cm->use_prev_frame_mvs || comp_pred_allowed) { vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, x->mbmi_ext->mode_context); } else { @@ -1425,10 +1427,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, struct macroblockd_plane *const pd = &xd->plane[0]; PREDICTION_MODE best_mode = ZEROMV; MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME; - MV_REFERENCE_FRAME usable_ref_frame; + MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame; TX_SIZE best_tx_size = TX_SIZES; INTERP_FILTER best_pred_filter = EIGHTTAP; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; + uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; @@ -1437,7 +1440,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; unsigned int sse_y = UINT_MAX; - const int intra_cost_penalty = set_intra_cost_penalty(cpi, bsize); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0); const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize]; @@ -1483,6 +1487,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int force_skip_low_temp_var = 0; int skip_ref_find_pred[4] = { 0 }; unsigned int sse_zeromv_normalized = UINT_MAX; + unsigned int best_sse_sofar = UINT_MAX; unsigned int thresh_svc_skip_golden = 500; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; @@ -1490,9 +1495,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int denoise_svc_pickmode = 1; #endif INTERP_FILTER filter_gf_svc = EIGHTTAP; + MV_REFERENCE_FRAME best_second_ref_frame = NONE; + int comp_modes = 0; + int num_inter_modes = (cpi->use_svc) ? 
RT_INTER_MODES_SVC : RT_INTER_MODES; + int flag_svc_subpel = 0; + int svc_mv_col = 0; + int svc_mv_row = 0; init_ref_frame_cost(cm, xd, ref_frame_cost); + memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES); + if (reuse_inter_pred) { int i; for (i = 0; i < 3; i++) { @@ -1561,7 +1574,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } #endif - if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc) { + if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc && + !cpi->rc.alt_ref_gf_group && !cpi->rc.last_frame_is_src_altref) { usable_ref_frame = LAST_FRAME; } else { usable_ref_frame = GOLDEN_FRAME; @@ -1575,6 +1589,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, skip_ref_find_pred[LAST_FRAME] = 1; skip_ref_find_pred[GOLDEN_FRAME] = 1; } + if (!cm->show_frame) { + if (cpi->rc.frames_since_key == 1) { + usable_ref_frame = LAST_FRAME; + skip_ref_find_pred[GOLDEN_FRAME] = 1; + skip_ref_find_pred[ALTREF_FRAME] = 1; + } + } } // For svc mode, on spatial_layer_id > 0: if the reference has different scale @@ -1609,18 +1630,39 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->oxcf.speed >= 8 && !cpi->use_svc && ((cpi->rc.frames_since_golden + 1) < x->last_sb_high_content || - x->last_sb_high_content > 40)) + x->last_sb_high_content > 40 || cpi->rc.frames_since_golden > 120)) usable_ref_frame = LAST_FRAME; + // Compound prediction modes: (0,0) on LAST/GOLDEN and ARF. + if (cm->reference_mode == REFERENCE_MODE_SELECT && + cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME) + comp_modes = 2; + for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, &ref_frame_skip_mask, flag_list, tile_data, mi_row, - mi_col, yv12_mb, bsize, force_skip_low_temp_var); + mi_col, yv12_mb, bsize, force_skip_low_temp_var, + comp_modes > 0); } } - for (idx = 0; idx < RT_INTER_MODES; ++idx) { + if (cpi->use_svc || cpi->oxcf.speed <= 7 || bsize < BLOCK_32X32) + x->sb_use_mv_part = 0; + + // Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used + // an averaging filter for downsampling (phase = 8). If so, we will test + // a nonzero motion mode on the spatial (golden) reference. + // The nonzero motion is half pixel shifted to left and top (-4, -4). + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && + svc_force_zero_mode[GOLDEN_FRAME - 1] && + cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id - 1] == 8) { + svc_mv_col = -4; + svc_mv_row = -4; + flag_svc_subpel = 1; + } + + for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) { int rate_mv = 0; int mode_rd_thresh; int mode_index; @@ -1629,17 +1671,56 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int is_skippable; int this_early_term = 0; int rd_computed = 0; + int flag_preduv_computed[2] = { 0 }; + int inter_mv_mode = 0; + int skip_this_mv = 0; + int comp_pred = 0; + int force_gf_mv = 0; + PREDICTION_MODE this_mode; + second_ref_frame = NONE; + + if (idx < num_inter_modes) { + this_mode = ref_mode_set[idx].pred_mode; + ref_frame = ref_mode_set[idx].ref_frame; + + if (cpi->use_svc) { + this_mode = ref_mode_set_svc[idx].pred_mode; + ref_frame = ref_mode_set_svc[idx].ref_frame; + } + } else { + // Add (0,0) compound modes.
+ this_mode = ZEROMV; + ref_frame = LAST_FRAME; + if (idx == num_inter_modes + comp_modes - 1) ref_frame = GOLDEN_FRAME; + second_ref_frame = ALTREF_FRAME; + comp_pred = 1; + } - PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode; + if (ref_frame > usable_ref_frame) continue; + if (skip_ref_find_pred[ref_frame]) continue; - ref_frame = ref_mode_set[idx].ref_frame; + if (flag_svc_subpel && ref_frame == GOLDEN_FRAME) { + force_gf_mv = 1; + // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), + // otherwise set NEWMV to (svc_mv_col, svc_mv_row). + if (this_mode == NEWMV) { + frame_mv[this_mode][ref_frame].as_mv.col = svc_mv_col; + frame_mv[this_mode][ref_frame].as_mv.row = svc_mv_row; + } else if (frame_mv[this_mode][ref_frame].as_mv.col != svc_mv_col || + frame_mv[this_mode][ref_frame].as_mv.row != svc_mv_row) { + continue; + } + } - if (cpi->use_svc) { - this_mode = ref_mode_set_svc[idx].pred_mode; - ref_frame = ref_mode_set_svc[idx].ref_frame; + if (comp_pred) { + const struct segmentation *const seg = &cm->seg; + if (!cpi->allow_comp_inter_inter) continue; + // Skip compound inter modes if ARF is not available. + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; } - if (ref_frame > usable_ref_frame) continue; - if (skip_ref_find_pred[ref_frame]) continue; // For SVC, skip the golden (spatial) reference search if sse of zeromv_last // is below threshold. @@ -1660,13 +1741,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, frame_mv[this_mode][ref_frame].as_int != 0)) continue; - if (cpi->rc.alt_ref_gf_group && + if (!cm->show_frame && ref_frame == ALTREF_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; + + if (cpi->rc.alt_ref_gf_group && cm->show_frame && cpi->rc.frames_since_golden > (cpi->rc.baseline_gf_interval >> 1) && ref_frame == GOLDEN_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) continue; - if (cpi->rc.alt_ref_gf_group && + if (cpi->rc.alt_ref_gf_group && cm->show_frame && + cpi->rc.frames_since_golden > 0 && cpi->rc.frames_since_golden < (cpi->rc.baseline_gf_interval >> 1) && ref_frame == ALTREF_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) @@ -1680,12 +1766,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped // later. 
- if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && + if (!force_gf_mv && force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) { continue; } - if ((cpi->sf.short_circuit_low_temp_var >= 2 || + if (x->content_state_sb != kVeryHighSad && + (cpi->sf.short_circuit_low_temp_var >= 2 || (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) && force_skip_low_temp_var && ref_frame == LAST_FRAME && this_mode == NEWMV) { @@ -1693,7 +1780,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } if (cpi->use_svc) { - if (svc_force_zero_mode[ref_frame - 1] && + if (!force_gf_mv && svc_force_zero_mode[ref_frame - 1] && frame_mv[this_mode][ref_frame].as_int != 0) continue; } @@ -1723,11 +1810,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (ref_frame_skip_mask & (1 << ref_frame)) continue; // Select prediction reference frames. - for (i = 0; i < MAX_MB_PLANE; i++) + for (i = 0; i < MAX_MB_PLANE; i++) { xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } mi->ref_frame[0] = ref_frame; - set_ref_ptrs(cm, xd, ref_frame, NONE); + mi->ref_frame[1] = second_ref_frame; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)]; mode_rd_thresh = best_mode_skip_txfm ? rd_threshes[mode_index] << 1 @@ -1747,12 +1837,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, &rd_thresh_freq_fact[mode_index]))) continue; - if (this_mode == NEWMV) { + if (this_mode == NEWMV && !force_gf_mv) { if (ref_frame > LAST_FRAME && !cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR) { int tmp_sad; uint32_t dis; - int cost_list[5]; + int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; if (bsize < BLOCK_16X16) continue; @@ -1780,17 +1870,37 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } else if (svc->use_base_mv && svc->spatial_layer_id) { if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { const int pre_stride = xd->plane[0].pre[0].stride; - int base_mv_sad = INT_MAX; - const float base_mv_bias = sf->base_mv_aggressive ? 1.5f : 1.0f; + unsigned int base_mv_sse = UINT_MAX; + int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4; const uint8_t *const pre_buf = xd->plane[0].pre[0].buf + (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride + (frame_mv[NEWMV][ref_frame].as_mv.col >> 3); - base_mv_sad = cpi->fn_ptr[bsize].sdf( - x->plane[0].src.buf, x->plane[0].src.stride, pre_buf, pre_stride); + cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + pre_buf, pre_stride, &base_mv_sse); + + // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16, + // for SVC encoding. + if (cpi->use_svc && cpi->svc.use_base_mv && bsize < BLOCK_16X16 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + continue; - if (base_mv_sad < (int)(base_mv_bias * x->pred_mv_sad[ref_frame])) { + // Exit NEWMV search if base_mv_sse is large. + if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale)) + continue; + if (base_mv_sse < (best_sse_sofar << 1)) { // Base layer mv is good. + // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since + // (0, 0) mode is already tested. 
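A note on the NEWMV gating in this hunk: the SSE of the base-layer motion vector is measured with the variance function, compared against the best SSE seen so far, and reduced to a roughly per-pixel figure by shifting with b_width_log2_lookup/b_height_log2_lookup instead of dividing. A minimal standalone sketch of that gating, with a hypothetical three-entry lookup and the thresholds from the hunk (the table and function names here are invented):

    /* Hypothetical stand-ins for b_width_log2_lookup / b_height_log2_lookup:
     * log2 of the block edge in 4-pixel units, so width == 4 << bw_log2.
     * Index: 0 = 8x8, 1 = 16x16, 2 = 64x64. */
    static const int bw_log2[3] = { 1, 2, 4 };
    static const int bh_log2[3] = { 1, 2, 4 };

    /* Returns 1 when the full NEWMV search can be skipped. */
    static int skip_newmv_for_base_mv(unsigned int base_mv_sse,
                                      unsigned int best_sse_sofar,
                                      int bsize_idx, int mv_row, int mv_col,
                                      int aggressive) {
      const int scale = 4; /* the patch uses 2 when avg_frame_low_motion > 60 */
      /* Base-layer MV predicts far worse than the best mode so far. */
      if (aggressive && base_mv_sse > (best_sse_sofar << scale)) return 1;
      if (base_mv_sse < (best_sse_sofar << 1)) {
        /* width * height == 16 << (bw_log2 + bh_log2), so this shift is a
         * division by the pixel count up to a factor of 16. */
        const unsigned int norm =
            base_mv_sse >> (bw_log2[bsize_idx] + bh_log2[bsize_idx]);
        if (aggressive && base_mv_sse <= best_sse_sofar && norm < 400 &&
            mv_row == 0 && mv_col == 0)
          return 1; /* (0,0) was already evaluated as ZEROMV */
      }
      return 0;
    }

The actual normalization and search continue below.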
+ unsigned int base_mv_sse_normalized = + base_mv_sse >> + (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar && + base_mv_sse_normalized < 400 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + continue; if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost, 1)) { @@ -1813,6 +1923,22 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } } + // TODO(jianj): Skipping the testing of (duplicate) non-zero motion vector + // causes some regression, leave it for duplicate zero-mv for now, until + // regression issue is resolved. + for (inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV; inter_mv_mode++) { + if (inter_mv_mode == this_mode || comp_pred) continue; + if (mode_checked[inter_mv_mode][ref_frame] && + frame_mv[this_mode][ref_frame].as_int == + frame_mv[inter_mv_mode][ref_frame].as_int && + frame_mv[inter_mv_mode][ref_frame].as_int == 0) { + skip_this_mv = 1; + break; + } + } + + if (skip_this_mv) continue; + // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no // need to compute best_pred_sad which is only used to skip golden NEWMV. if (use_golden_nonzeromv && this_mode == NEWMV && ref_frame == LAST_FRAME && @@ -1827,13 +1953,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, x->pred_mv_sad[LAST_FRAME] = best_pred_sad; } - if (this_mode != NEARESTMV && + if (this_mode != NEARESTMV && !comp_pred && frame_mv[this_mode][ref_frame].as_int == frame_mv[NEARESTMV][ref_frame].as_int) continue; mi->mode = this_mode; mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int; + mi->mv[1].as_int = 0; // Search for the best prediction filter type, when the resulting // motion vector is at sub-pixel accuracy level for luma component, i.e., @@ -1851,7 +1978,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search && (ref_frame == LAST_FRAME || - (ref_frame == GOLDEN_FRAME && + (ref_frame == GOLDEN_FRAME && !force_gf_mv && (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) && (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) { int pf_rate[3]; @@ -1907,9 +2034,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, pd->dst.stride = this_mode_pred->stride; } } else { - const int large_block = (x->sb_is_skin || cpi->oxcf.speed < 7) - ? bsize > BLOCK_32X32 - : bsize >= BLOCK_32X32; + // For low motion content use x->sb_is_skin in addition to VeryHighSad + // for setting large_block. + const int large_block = + (x->content_state_sb == kVeryHighSad || + (x->sb_is_skin && cpi->rc.avg_frame_low_motion > 70) || + cpi->oxcf.speed < 7) + ? bsize > BLOCK_32X32 + : bsize >= BLOCK_32X32; mi->interp_filter = (filter_ref == SWITCHABLE) ? 
EIGHTTAP : filter_ref; if (cpi->use_svc && ref_frame == GOLDEN_FRAME && @@ -1924,7 +2056,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cm->base_qindex) { model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col, - &this_early_term); + &this_early_term, flag_preduv_computed); } else { rd_computed = 1; model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, @@ -1936,6 +2068,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, sse_zeromv_normalized = sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); } + if (sse_y < best_sse_sofar) best_sse_sofar = sse_y; } if (!this_early_term) { @@ -1968,13 +2101,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); } - if (x->color_sensitivity[0] || x->color_sensitivity[1]) { + if (!this_early_term && + (x->color_sensitivity[0] || x->color_sensitivity[1])) { RD_COST rdc_uv; const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]); - if (x->color_sensitivity[0]) + if (x->color_sensitivity[0] && !flag_preduv_computed[0]) { vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1); - if (x->color_sensitivity[1]) + flag_preduv_computed[0] = 1; + } + if (x->color_sensitivity[1] && !flag_preduv_computed[1]) { vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2); + flag_preduv_computed[1] = 1; + } model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2); this_rdc.rate += rdc_uv.rate; this_rdc.dist += rdc_uv.dist; @@ -1983,6 +2121,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_rdc.rate += rate_mv; this_rdc.rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]] [INTER_OFFSET(this_mode)]; + // TODO(marpan): Add costing for compound mode. this_rdc.rate += ref_frame_cost[ref_frame]; this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); @@ -2002,7 +2141,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->allow_encode_breakout) { encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode, var_y, sse_y, yv12_mb, &this_rdc.rate, - &this_rdc.dist); + &this_rdc.dist, flag_preduv_computed); if (x->skip) { this_rdc.rate += rate_mv; this_rdc.rdcost = @@ -2022,6 +2161,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (void)ctx; #endif + mode_checked[this_mode][ref_frame] = 1; + if (this_rdc.rdcost < best_rdc.rdcost || x->skip) { best_rdc = this_rdc; best_mode = this_mode; @@ -2030,6 +2171,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, best_ref_frame = ref_frame; best_mode_skip_txfm = x->skip_txfm[0]; best_early_term = this_early_term; + best_second_ref_frame = second_ref_frame; if (reuse_inter_pred) { free_pred_buffer(best_pred); @@ -2056,6 +2198,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, mi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int; x->skip_txfm[0] = best_mode_skip_txfm; + mi->ref_frame[1] = best_second_ref_frame; // For spatial enhancement layer: perform intra prediction only if base // layer is chosen as the reference.
Always perform intra prediction if @@ -2074,7 +2217,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Perform intra prediction search, if the best SAD is above a certain // threshold. if (best_rdc.rdcost == INT64_MAX || - ((!force_skip_low_temp_var || bsize < BLOCK_32X32) && + ((!force_skip_low_temp_var || bsize < BLOCK_32X32 || + x->content_state_sb == kVeryHighSad) && perform_intra_pred && !x->skip && best_rdc.rdcost > inter_mode_thresh && bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad && !x->lowvar_highsumdiff)) { @@ -2095,15 +2239,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vpx_highbd_convolve_copy( CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride, - NULL, 0, NULL, 0, bw, bh, xd->bd); + NULL, 0, 0, 0, 0, bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, - 0, NULL, 0, bw, bh); + 0, 0, 0, 0, bw, bh); #else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, 0, - NULL, 0, bw, bh); + 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH best_pred = this_mode_pred; } @@ -2168,8 +2312,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, best_mode = this_mode; best_intra_tx_size = mi->tx_size; best_ref_frame = INTRA_FRAME; + best_second_ref_frame = NONE; mi->uv_mode = this_mode; mi->mv[0].as_int = INVALID_MV; + mi->mv[1].as_int = INVALID_MV; best_mode_skip_txfm = x->skip_txfm[0]; } } @@ -2185,6 +2331,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, pd->dst = orig_dst; mi->mode = best_mode; mi->ref_frame[0] = best_ref_frame; + mi->ref_frame[1] = best_second_ref_frame; x->skip_txfm[0] = best_mode_skip_txfm; if (!is_inter_block(mi)) { @@ -2197,14 +2344,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cm->use_highbitdepth) vpx_highbd_convolve_copy( CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, - CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, NULL, 0, + CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, - pd->dst.stride, NULL, 0, NULL, 0, bw, bh); + pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh); #else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, - pd->dst.stride, NULL, 0, NULL, 0, bw, bh); + pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH } } @@ -2214,6 +2361,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow && cpi->denoiser.reset == 0) { VP9_DENOISER_DECISION decision = COPY_BLOCK; + ctx->sb_skip_denoising = 0; + // TODO(marpan): There is an issue with denoising when the + // superblock partitioning scheme is based on the pickmode. + // Remove this condition when the issue is resolved. 
+ if (x->sb_pickmode_part) ctx->sb_skip_denoising = 1; vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost, frame_mv, reuse_inter_pred, best_tx_size, best_mode, best_ref_frame, best_pred_filter, @@ -2225,6 +2377,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } #endif + if (best_ref_frame == ALTREF_FRAME || best_second_ref_frame == ALTREF_FRAME) + x->arf_frame_usage++; + else if (best_ref_frame != INTRA_FRAME) + x->lastgolden_frame_usage++; + if (cpi->sf.adaptive_rd_thresh) { THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mi->mode)]; diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c index f2a59a4af..09f61ead2 100644 --- a/libvpx/vp9/encoder/vp9_quantize.c +++ b/libvpx/vp9/encoder/vp9_quantize.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> #include <math.h> #include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" @@ -28,33 +29,33 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *iscan) { int i, eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = (tmp * quant_ptr[rc != 0]) >> 16; + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (tmp) eob = i; - } + if (tmp) eob = i; } *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, @@ -64,24 +65,24 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, int eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. 
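Before the high-bit-depth variant resumes below, one note the two quantizers share: quant_ptr holds roughly (1 << 16) / qstep in Q16 fixed point, so (abs_coeff + round) * quant >> 16 approximates abs_coeff / qstep without a division. A minimal standalone sketch with made-up constants (real values come from the encoder's quant tables; the int16 clamp used by the 8-bit path is omitted for brevity):

    #include <stdio.h>

    /* Quantize one coefficient the way vp9_quantize_fp_c does: quant is
     * ~((1 << 16) / qstep), round is the rounding offset, dequant is the
     * step itself. */
    static void quantize_one(int coeff, int round, int quant, int dequant) {
      const int sign = coeff >> 31;
      const int abs_coeff = (coeff ^ sign) - sign;
      const int tmp = ((abs_coeff + round) * quant) >> 16; /* Q16 multiply */
      const int qcoeff = (tmp ^ sign) - sign; /* restore the sign */
      printf("coeff=%d -> qcoeff=%d dqcoeff=%d\n", coeff, qcoeff,
             qcoeff * dequant);
    }

    int main(void) {
      /* Illustrative only: qstep = 32, so quant = 65536 / 32 = 2048. */
      quantize_one(-100, 16, 2048, 32); /* coeff=-100 -> qcoeff=-3 dqcoeff=-96 */
      return 0;
    }

The removed !skip_block branch of the high-bit-depth function continues below.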
- for (i = 0; i < count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + round_ptr[rc != 0]; - const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) eob = i; - } + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + round_ptr[rc != 0]; + const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tran_low_t)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_qcoeff) eob = i; } *eob_ptr = eob + 1; } @@ -97,28 +98,28 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *scan, const int16_t *iscan) { int i, eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - int tmp = 0; - int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { - abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); - tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - } - - if (tmp) eob = i; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp = 0; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; } + + if (tmp) eob = i; } *eob_ptr = eob + 1; } @@ -132,28 +133,27 @@ void vp9_highbd_quantize_fp_32x32_c( int i, eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - for (i = 0; i < n_coeffs; i++) { - uint32_t abs_qcoeff = 0; - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { - const int64_t tmp = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - } - - if (abs_qcoeff) eob = i; + for (i = 0; i < 
n_coeffs; i++) { + int abs_qcoeff = 0; + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; } + + if (abs_qcoeff) eob = i; } *eob_ptr = eob + 1; } @@ -164,22 +164,28 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; + tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block), + *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const int n_coeffs = 4 * 4; + + if (x->skip_block) { + memset(qcoeff, 0, n_coeffs * sizeof(*qcoeff)); + memset(dqcoeff, 0, n_coeffs * sizeof(*dqcoeff)); + return; + } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, - p->zbin, p->round, p->quant, p->quant_shift, - BLOCK_OFFSET(p->qcoeff, block), - BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant, + vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, + x->skip_block, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, &p->eobs[block], scan, iscan); return; } #endif - vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, p->zbin, - p->round, p->quant, p->quant_shift, - BLOCK_OFFSET(p->qcoeff, block), - BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant, &p->eobs[block], - scan, iscan); + vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, x->skip_block, + p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, &p->eobs[block], scan, iscan); } static void invert_quant(int16_t *quant, int16_t *shift, int d) { diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c index 27fea5d4e..b7f3a0e89 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/libvpx/vp9/encoder/vp9_ratectrl.c @@ -44,11 +44,6 @@ #define MIN_BPB_FACTOR 0.005 #define MAX_BPB_FACTOR 50 -#define FRAME_OVERHEAD_BITS 200 - -// Use this macro to turn on/off use of alt-refs in one-pass vbr mode. -#define USE_ALTREF_FOR_ONE_PASS 0 - #if CONFIG_VP9_HIGHBITDEPTH #define ASSIGN_MINQ_TABLE(bit_depth, name) \ do { \ @@ -209,24 +204,29 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth)); return VPXMAX(FRAME_OVERHEAD_BITS, - (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS); + (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS)); } int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { const RATE_CONTROL *rc = &cpi->rc; const VP9EncoderConfig *oxcf = &cpi->oxcf; - const int min_frame_target = - VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); - if (target < min_frame_target) target = min_frame_target; - if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { - // If there is an active ARF at this location use the minimum - // bits on this frame even if it is a constructed arf. - // The active maximum quantizer insures that an appropriate - // number of bits will be spent if needed for constructed ARFs. 
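For context on the restructuring in this hunk: the minimum-size floor (the larger of min_frame_bandwidth and 1/32 of the average frame budget) now applies only when not in two-pass mode, while the max_frame_bandwidth ceiling and the optional rc_max_inter_bitrate_pct cap keep applying to every frame. A condensed sketch of the resulting policy, assuming the final clamp against the inter-bitrate cap that the truncated hunk implies (names shortened, the golden/alt-ref overlay test folded into one flag):

    /* Sketch of vp9_rc_clamp_pframe_target_size() after this patch;
     * two_pass corresponds to cpi->oxcf.pass == 2. */
    static int clamp_pframe_target(int target, int two_pass,
                                   int is_arf_overlay, int min_frame_bw,
                                   int avg_frame_bw, int max_frame_bw,
                                   int max_inter_bitrate_pct) {
      if (!two_pass) {
        const int floor_bits =
            min_frame_bw > (avg_frame_bw >> 5) ? min_frame_bw
                                               : (avg_frame_bw >> 5);
        if (target < floor_bits) target = floor_bits;
        /* An overlay frame for an existing ARF gets only the minimum. */
        if (is_arf_overlay) target = floor_bits;
      }
      if (target > max_frame_bw) target = max_frame_bw;
      if (max_inter_bitrate_pct) {
        const int max_rate = avg_frame_bw * max_inter_bitrate_pct / 100;
        if (target > max_rate) target = max_rate;
      }
      return target;
    }

The reorganized function body continues below.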
- target = min_frame_target; + + if (cpi->oxcf.pass != 2) { + const int min_frame_target = + VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); + if (target < min_frame_target) target = min_frame_target; + if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. + // The active maximum quantizer insures that an appropriate + // number of bits will be spent if needed for constructed ARFs. + target = min_frame_target; + } } + // Clip the frame target to the maximum allowed value. if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + if (oxcf->rc_max_inter_bitrate_pct) { const int max_rate = rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; @@ -353,8 +353,10 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->af_ratio_onepass_vbr = 10; rc->prev_avg_source_sad_lag = 0; rc->high_source_sad = 0; + rc->reset_high_source_sad = 0; rc->high_source_sad_lagindex = -1; rc->alt_ref_gf_group = 0; + rc->last_frame_is_src_altref = 0; rc->fac_active_worst_inter = 150; rc->fac_active_worst_gf = 100; rc->force_qpmin = 0; @@ -585,7 +587,7 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, // In CBR mode, this makes sure q is between oscillating Qs to prevent // resonance. - if (cpi->oxcf.rc_mode == VPX_CBR && + if (cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.reset_high_source_sad && (!cpi->oxcf.gf_cbr_boost_pct || !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && @@ -593,13 +595,6 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); } -#if USE_ALTREF_FOR_ONE_PASS - if (cpi->oxcf.enable_auto_arf && cpi->oxcf.pass == 0 && - cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && - cpi->rc.is_src_frame_alt_ref && !cpi->rc.alt_ref_gf_group) { - q = VPXMIN(q, (q + cpi->rc.last_boosted_qindex) >> 1); - } -#endif return q; } @@ -679,7 +674,8 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { int active_worst_quality; int ambient_qp; unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers; - if (cm->frame_type == KEY_FRAME) return rc->worst_quality; + if (cm->frame_type == KEY_FRAME || rc->reset_high_source_sad) + return rc->worst_quality; // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] // for the first few frames following key frame. These are both initialized // to worst_quality and updated with (3/4, 1/4) average in postencode_update. @@ -1011,6 +1007,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, qdelta = vp9_compute_qdelta_by_rate( &cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth); } + if (rc->high_source_sad && cpi->sf.use_altref_onepass) qdelta = 0; *top_index = active_worst_quality + qdelta; *top_index = (*top_index > *bottom_index) ? 
*top_index : *bottom_index; } @@ -1339,6 +1336,28 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { } } +static void update_altref_usage(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + int sum_ref_frame_usage = 0; + int arf_frame_usage = 0; + int mi_row, mi_col; + if (cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref && + !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8) { + int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + sum_ref_frame_usage += cpi->count_arf_frame_usage[sboffset] + + cpi->count_lastgolden_frame_usage[sboffset]; + arf_frame_usage += cpi->count_arf_frame_usage[sboffset]; + } + } + if (sum_ref_frame_usage > 0) { + double altref_count = 100.0 * arf_frame_usage / sum_ref_frame_usage; + cpi->rc.perc_arf_usage = + 0.75 * cpi->rc.perc_arf_usage + 0.25 * altref_count; + } +} + static void compute_frame_low_motion(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; int mi_row, mi_col; @@ -1462,8 +1481,15 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } if (oxcf->pass == 0) { - if (cm->frame_type != KEY_FRAME) compute_frame_low_motion(cpi); + if (cm->frame_type != KEY_FRAME) { + compute_frame_low_motion(cpi); + if (cpi->sf.use_altref_onepass) update_altref_usage(cpi); + } + cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref; } + if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0; + + rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth; } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { @@ -1556,8 +1582,9 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { // Adjust boost and af_ratio based on avg_frame_low_motion, which varies // between 0 and 100 (stationary, 100% zero/small motion). 
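The assignment that follows scales DEFAULT_GF_BOOST by 2m / (m + 100), where m is avg_frame_low_motion, then floors the result at 500: fully static content (m = 100) keeps the whole boost and m = 50 keeps two thirds of it. A spot check, assuming DEFAULT_GF_BOOST is 2000 (its definition elsewhere in vp9_ratectrl.c; treat the constant as an assumption here):

    #define DEFAULT_GF_BOOST 2000

    /* Mirrors the boost formula in the assignment below. */
    static int gfu_boost_for(int avg_frame_low_motion) {
      const int m = avg_frame_low_motion;
      const int boost = DEFAULT_GF_BOOST * (m << 1) / (m + 100);
      return boost > 500 ? boost : 500;
    }

    /* gfu_boost_for(100) == 2000, gfu_boost_for(50) == 1333,
     * gfu_boost_for(10) == 500 (363 before the floor). */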
rc->gfu_boost = - VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / - (rc->avg_frame_low_motion + 100)); + VPXMAX(500, + DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / + (rc->avg_frame_low_motion + 100)); rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } adjust_gfint_frame_constraint(cpi, rc->frames_to_key); @@ -1565,12 +1592,10 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { cpi->refresh_golden_frame = 1; rc->source_alt_ref_pending = 0; rc->alt_ref_gf_group = 0; -#if USE_ALTREF_FOR_ONE_PASS - if (cpi->oxcf.enable_auto_arf) { + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) { rc->source_alt_ref_pending = 1; rc->alt_ref_gf_group = 1; } -#endif } if (cm->frame_type == KEY_FRAME) target = calc_iframe_target_size_one_pass_vbr(cpi); @@ -1847,6 +1872,26 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, // Clamp min to max rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval); + + if (oxcf->target_level == LEVEL_AUTO) { + const uint32_t pic_size = cpi->common.width * cpi->common.height; + const uint32_t pic_breadth = + VPXMAX(cpi->common.width, cpi->common.height); + int i; + for (i = LEVEL_1; i < LEVEL_MAX; ++i) { + if (vp9_level_defs[i].max_luma_picture_size >= pic_size && + vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { + if (rc->min_gf_interval <= + (int)vp9_level_defs[i].min_altref_distance) { + rc->min_gf_interval = + (int)vp9_level_defs[i].min_altref_distance + 1; + rc->max_gf_interval = + VPXMAX(rc->max_gf_interval, rc->min_gf_interval); + } + break; + } + } + } } } @@ -1933,9 +1978,11 @@ void vp9_set_target_rate(VP9_COMP *cpi) { else target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); - // Correction to rate target based on prior over or under shoot. - if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ) - vbr_rate_correction(cpi, &target_rate); + if (!cpi->oxcf.vbr_corpus_complexity) { + // Correction to rate target based on prior over or under shoot. + if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ) + vbr_rate_correction(cpi, &target_rate); + } vp9_rc_set_frame_target(cpi, target_rate); } @@ -2070,7 +2117,8 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { return resize_action; } -void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { +static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, + uint64_t avg_sad_current) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; @@ -2081,7 +2129,7 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { uint64_t avg_source_sad_lag = avg_sad_current; int high_source_sad_lagindex = -1; int steady_sad_lagindex = -1; - uint32_t sad_thresh1 = 60000; + uint32_t sad_thresh1 = 70000; uint32_t sad_thresh2 = 120000; int low_content = 0; int high_content = 0; @@ -2185,11 +2233,16 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { rc->af_ratio_onepass_vbr = 5; rc->gfu_boost = DEFAULT_GF_BOOST >> 2; } -#if USE_ALTREF_FOR_ONE_PASS - if (cpi->oxcf.enable_auto_arf) { - // Don't use alt-ref if there is a scene cut within the group, - // or content is not low. - if ((rc->high_source_sad_lagindex > 0 && + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) { + // Flag to disable usage of ARF based on past usage, only allow this + // disabling if current frame/group does not start with key frame or + // scene cut. Note perc_arf_usage is only computed for speed >= 5. 
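Tying this to update_altref_usage() earlier in the file: perc_arf_usage is a running average, 0.75 of the previous estimate plus 0.25 of the percentage of superblocks whose best mode referenced the ARF, and the arf_usage_low flag computed just below disables the alt-ref for a group once that average drops under 15. A small sketch of the estimator and its decay (the helper name is hypothetical):

    /* Running estimate of how often the alt-ref is actually referenced;
     * counts are per-frame superblock tallies. */
    static void update_arf_usage(double *perc_arf_usage, int arf_count,
                                 int total_count) {
      if (total_count > 0) {
        const double arf_pct = 100.0 * arf_count / total_count;
        *perc_arf_usage = 0.75 * *perc_arf_usage + 0.25 * arf_pct;
      }
    }

    /* Example decay: starting at 40%, four frames with 0% ARF usage give
     * 40 * 0.75^4 ~= 12.7, under the 15 cutoff below, so the next GF
     * group will drop the alt-ref. */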
+ int arf_usage_low = + (cm->frame_type != KEY_FRAME && !rc->high_source_sad && + cpi->rc.perc_arf_usage < 15 && cpi->oxcf.speed >= 5); + // Don't use alt-ref for this group under certain conditions. + if (arf_usage_low || + (rc->high_source_sad_lagindex > 0 && rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) || (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) { rc->source_alt_ref_pending = 0; @@ -2198,12 +2251,12 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { rc->source_alt_ref_pending = 1; rc->alt_ref_gf_group = 1; // If alt-ref is used for this gf group, limit the interval. - if (rc->baseline_gf_interval > 10 && - rc->baseline_gf_interval < rc->frames_to_key) - rc->baseline_gf_interval = 10; + if (rc->baseline_gf_interval > 12) { + rc->baseline_gf_interval = 12; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + } } } -#endif target = calc_pframe_target_size_one_pass_vbr(cpi); vp9_rc_set_frame_target(cpi, target); } @@ -2233,11 +2286,14 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { int start_frame = 0; int frames_to_buffer = 1; int frame = 0; + int scene_cut_force_key_frame = 0; uint64_t avg_sad_current = 0; uint32_t min_thresh = 4000; float thresh = 8.0f; + uint32_t thresh_key = 140000; + if (cpi->oxcf.speed <= 5) thresh_key = 240000; if (cpi->oxcf.rc_mode == VPX_VBR) { - min_thresh = 60000; + min_thresh = 65000; thresh = 2.1f; } if (cpi->oxcf.lag_in_frames > 0) { @@ -2263,6 +2319,8 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { rc->high_source_sad = 1; else rc->high_source_sad = 0; + if (rc->high_source_sad && avg_sad_current > thresh_key) + scene_cut_force_key_frame = 1; // Update recursive average for current frame. if (avg_sad_current > 0) rc->avg_source_sad[0] = @@ -2323,6 +2381,8 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { rc->high_source_sad = 1; else rc->high_source_sad = 0; + if (rc->high_source_sad && avg_sad > thresh_key) + scene_cut_force_key_frame = 1; if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR) rc->avg_source_sad[0] = (3 * rc->avg_source_sad[0] + avg_sad) >> 2; } else { @@ -2330,6 +2390,23 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { } } } + // For CBR non-screen content mode, check if we should reset the rate + // control. Reset is done if high_source_sad is detected and the rate + // control is at very low QP with rate correction factor at min level. + if (cpi->oxcf.rc_mode == VPX_CBR && + cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) { + if (rc->high_source_sad && rc->last_q[INTER_FRAME] == rc->best_quality && + rc->avg_frame_qindex[INTER_FRAME] < (rc->best_quality << 1) && + rc->rate_correction_factors[INTER_NORMAL] == MIN_BPB_FACTOR) { + rc->rate_correction_factors[INTER_NORMAL] = 0.5; + rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality; + rc->buffer_level = rc->optimal_buffer_level; + rc->bits_off_target = rc->optimal_buffer_level; + rc->reset_high_source_sad = 1; + } + if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad) + rc->this_frame_target = rc->avg_frame_bandwidth; + } // For VBR, under scene change/high content change, force golden refresh. 
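On the scene-detection side of this hunk: a frame is flagged high_source_sad when its average source SAD clears both a fixed floor and a multiple of the running SAD average, and the new scene_cut_force_key_frame additionally promotes the event to a key frame once the SAD passes thresh_key. A schematic of that decision using the VBR numbers above (min_thresh 65000, factor 2.1, thresh_key 140000 for speed over 5); the encoder compares against rc->avg_source_sad via a VPXMAX of the two bounds, which this two-condition form is equivalent to:

    #include <stdint.h>

    /* Schematic of the per-frame test in vp9_scene_detection_onepass(). */
    static void detect_scene_change(uint64_t avg_sad_current,
                                    uint64_t avg_source_sad,
                                    int *high_source_sad,
                                    int *force_key_frame) {
      const uint64_t min_thresh = 65000;  /* VBR floor in this patch */
      const double thresh = 2.1;          /* multiple of the running avg */
      const uint64_t thresh_key = 140000; /* 240000 when speed <= 5 */
      *high_source_sad =
          avg_sad_current > min_thresh &&
          (double)avg_sad_current > thresh * (double)avg_source_sad;
      *force_key_frame = *high_source_sad && avg_sad_current > thresh_key;
    }

The VBR golden-refresh branch announced just above follows.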
if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME && rc->high_source_sad && rc->frames_to_key > 3 && @@ -2337,10 +2414,10 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { cpi->ext_refresh_frame_flags_pending == 0) { int target; cpi->refresh_golden_frame = 1; + if (scene_cut_force_key_frame) cm->frame_type = KEY_FRAME; rc->source_alt_ref_pending = 0; -#if USE_ALTREF_FOR_ONE_PASS - if (cpi->oxcf.enable_auto_arf) rc->source_alt_ref_pending = 1; -#endif + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) + rc->source_alt_ref_pending = 1; rc->gfu_boost = DEFAULT_GF_BOOST >> 1; rc->baseline_gf_interval = VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval)); diff --git a/libvpx/vp9/encoder/vp9_ratectrl.h b/libvpx/vp9/encoder/vp9_ratectrl.h index 9e4623195..c1b210677 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.h +++ b/libvpx/vp9/encoder/vp9_ratectrl.h @@ -32,6 +32,8 @@ extern "C" { #define FIXED_GF_INTERVAL 8 // Used in some testing modes only #define ONEHALFONLY_RESIZE 0 +#define FRAME_OVERHEAD_BITS 200 + typedef enum { INTER_NORMAL = 0, INTER_HIGH = 1, @@ -150,6 +152,8 @@ typedef struct { int rc_2_frame; int q_1_frame; int q_2_frame; + // Keep track of the last target average frame bandwidth. + int last_avg_frame_bandwidth; // Auto frame-scaling variables. FRAME_SCALE_LEVEL frame_size_selector; @@ -164,11 +168,14 @@ typedef struct { uint64_t prev_avg_source_sad_lag; int high_source_sad_lagindex; int alt_ref_gf_group; + int last_frame_is_src_altref; int high_source_sad; int count_last_scene_change; int avg_frame_low_motion; int af_ratio_onepass_vbr; int force_qpmin; + int reset_high_source_sad; + double perc_arf_usage; } RATE_CONTROL; struct VP9_COMP; diff --git a/libvpx/vp9/encoder/vp9_rd.c b/libvpx/vp9/encoder/vp9_rd.c index 39a7742f0..6b2306ce9 100644 --- a/libvpx/vp9/encoder/vp9_rd.c +++ b/libvpx/vp9/encoder/vp9_rd.c @@ -670,19 +670,21 @@ void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, } } -int vp9_get_intra_cost_penalty(int qindex, int qdelta, - vpx_bit_depth_t bit_depth) { - const int q = vp9_dc_quant(qindex, qdelta, bit_depth); -#if CONFIG_VP9_HIGHBITDEPTH - switch (bit_depth) { - case VPX_BITS_8: return 20 * q; - case VPX_BITS_10: return 5 * q; - case VPX_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2); - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; - } -#else - return 20 * q; -#endif // CONFIG_VP9_HIGHBITDEPTH +int vp9_get_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize, + int qindex, int qdelta) { + // Reduce the intra cost penalty for small blocks (<=16x16). + int reduction_fac = + (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0; + + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh) + // Don't reduce intra cost penalty if estimated noise level is high. + reduction_fac = 0; + + // Always use VPX_BITS_8 as input here because the penalty is applied + // to rate not distortion so we want a consistent penalty for all bit + // depths. If the actual bit depth were passed in here then the value + // retured by vp9_dc_quant() would scale with the bit depth and we would + // then need to apply inverse scaling to correct back to a bit depth + // independent rate penalty. 
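Concretely, the return that follows computes penalty = (20 * dc_quant_8bit) >> reduction_fac. A worked sketch of the block-size scaling (dc_quant_8bit() is a stub standing in for vp9_dc_quant(qindex, qdelta, VPX_BITS_8); the step value 40 is an arbitrary illustration, not a real table entry):

    /* Stub for the 8-bit DC quantizer step from the quant lookup table. */
    static int dc_quant_8bit(int qindex, int qdelta) {
      (void)qindex;
      (void)qdelta;
      return 40; /* placeholder value */
    }

    static int intra_cost_penalty(int qindex, int qdelta, int is_le_16x16,
                                  int is_le_8x8, int high_noise) {
      /* Halve the penalty for <=16x16 blocks and quarter it for <=8x8,
       * unless the noise estimate is high. */
      int reduction_fac = is_le_16x16 ? (is_le_8x8 ? 4 : 2) : 0;
      if (high_noise) reduction_fac = 0;
      return (20 * dc_quant_8bit(qindex, qdelta)) >> reduction_fac;
    }

    /* With the placeholder step of 40: 800 for blocks above 16x16,
     * 200 for 16x16, 50 for 8x8 and smaller. */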
+ return (20 * vp9_dc_quant(qindex, qdelta, VPX_BITS_8)) >> reduction_fac; } diff --git a/libvpx/vp9/encoder/vp9_rd.h b/libvpx/vp9/encoder/vp9_rd.h index 1e1176866..59022c106 100644 --- a/libvpx/vp9/encoder/vp9_rd.h +++ b/libvpx/vp9/encoder/vp9_rd.h @@ -191,13 +191,18 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd, const struct scale_factors *scale, const struct scale_factors *scale_uv); -int vp9_get_intra_cost_penalty(int qindex, int qdelta, - vpx_bit_depth_t bit_depth); +int vp9_get_intra_cost_penalty(const struct VP9_COMP *const cpi, + BLOCK_SIZE bsize, int qindex, int qdelta); +unsigned int vp9_get_sby_variance(struct VP9_COMP *cpi, + const struct buf_2d *ref, BLOCK_SIZE bs); unsigned int vp9_get_sby_perpixel_variance(struct VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs); #if CONFIG_VP9_HIGHBITDEPTH +unsigned int vp9_high_get_sby_variance(struct VP9_COMP *cpi, + const struct buf_2d *ref, BLOCK_SIZE bs, + int bd); unsigned int vp9_high_get_sby_perpixel_variance(struct VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd); diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c index bf0fec3d8..2ba6378c5 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libvpx/vp9/encoder/vp9_rdopt.c @@ -600,7 +600,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, - 32, NULL, 0, NULL, 0, bs, bs, xd->bd); + 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd); if (xd->lossless) { vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); } else { @@ -623,7 +623,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, recon = CONVERT_TO_BYTEPTR(recon16); } else { #endif // CONFIG_VP9_HIGHBITDEPTH - vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs); + vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs); switch (tx_size) { case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break; case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break; @@ -632,7 +632,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, // this is like vp9_short_idct4x4 but has a special case around // eob<=1, which is significant (not just an optimization) for // the lossless case. - x->itxm_add(dqcoeff, recon, 32, *eob); + x->inv_txfm_add(dqcoeff, recon, 32, *eob); break; default: assert(0 && "Invalid transform size"); break; } @@ -730,7 +730,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, } } else { // SKIP_TXFM_AC_DC - // skip forward transform + // skip forward transform. Because this is handled here, the quantization + // does not need to do it. 
x->plane[plane].eobs[block] = 0; sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; dist = sse; @@ -1576,8 +1577,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, k += (idy * 2 + idx); coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]); coeff = BLOCK_OFFSET(p->coeff, k); - x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), - coeff, 8); + x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), + coeff, 8); vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan); #if CONFIG_VP9_HIGHBITDEPTH thisdistortion += vp9_highbd_block_error_dispatch( @@ -2875,57 +2876,82 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, // This function is designed to apply a bias or adjustment to an rd value based // on the relative variance of the source and reconstruction. -#define LOW_VAR_THRESH 16 -#define VLOW_ADJ_MAX 25 -#define VHIGH_ADJ_MAX 8 +#define VERY_LOW_VAR_THRESH 2 +#define LOW_VAR_THRESH 5 +#define VAR_MULT 100 +static unsigned int max_var_adjust[VP9E_CONTENT_INVALID] = { 16, 16, 100 }; + static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *this_rd, MV_REFERENCE_FRAME ref_frame, unsigned int source_variance) { MACROBLOCKD *const xd = &x->e_mbd; - unsigned int recon_variance; + unsigned int rec_variance; + unsigned int src_variance; + unsigned int src_rec_min; unsigned int absvar_diff = 0; - int64_t var_error = 0; - int64_t var_factor = 0; + unsigned int var_factor = 0; + unsigned int adj_max; + vp9e_tune_content content_type = cpi->oxcf.content; if (*this_rd == INT64_MAX) return; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - recon_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, + if (source_variance > 0) { + rec_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize, xd->bd); + src_variance = source_variance; + } else { + rec_variance = + vp9_high_get_sby_variance(cpi, &xd->plane[0].dst, bsize, xd->bd); + src_variance = + vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, xd->bd); + } } else { - recon_variance = - vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + if (source_variance > 0) { + rec_variance = + vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = source_variance; + } else { + rec_variance = vp9_get_sby_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); + } } #else - recon_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + if (source_variance > 0) { + rec_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = source_variance; + } else { + rec_variance = vp9_get_sby_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); + } #endif // CONFIG_VP9_HIGHBITDEPTH - if ((source_variance + recon_variance) > LOW_VAR_THRESH) { - absvar_diff = (source_variance > recon_variance) - ? (source_variance - recon_variance) - : (recon_variance - source_variance); + // Lower of source (raw per pixel value) and recon variance. Note that + // if the source per pixel is 0 then the recon value here will not be per + // pixel (see above) so will likely be much larger. 
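The replacement logic below then applies a multiplicative penalty proportional to the relative variance mismatch, var_factor = min(adj_max, VAR_MULT * |src - rec| / max(1, src)), and inflates this_rd by that percentage. A worked sketch (adj_max = 100 matches the VP9E_CONTENT_FILM entry of max_var_adjust):

    #include <stdint.h>

    /* Sketch of the new rd_variance_adjustment() core: only flat areas
     * (min of source/recon variance <= LOW_VAR_THRESH == 5) are touched. */
    static int64_t adjust_rd(int64_t rd, unsigned int src_var,
                             unsigned int rec_var, unsigned int adj_max) {
      const unsigned int low_var_thresh = 5;
      const unsigned int min_var = src_var < rec_var ? src_var : rec_var;
      unsigned int absdiff, var_factor;
      if (min_var > low_var_thresh) return rd;
      absdiff = src_var > rec_var ? src_var - rec_var : rec_var - src_var;
      var_factor = (unsigned int)(((int64_t)100 * absdiff) /
                                  (src_var > 0 ? src_var : 1));
      if (var_factor > adj_max) var_factor = adj_max;
      return rd + rd * var_factor / 100;
    }

    /* Example: src_var = 4, rec_var = 12, adj_max = 100 (film):
     * var_factor = min(100, 100 * 8 / 4) = 100, so the RD cost doubles. */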
+ src_rec_min = VPXMIN(source_variance, rec_variance); - var_error = ((int64_t)200 * source_variance * recon_variance) / - (((int64_t)source_variance * source_variance) + - ((int64_t)recon_variance * recon_variance)); - var_error = 100 - var_error; - } + if (src_rec_min > LOW_VAR_THRESH) return; + + absvar_diff = (src_variance > rec_variance) ? (src_variance - rec_variance) + : (rec_variance - src_variance); + + adj_max = max_var_adjust[content_type]; + + var_factor = + (unsigned int)((int64_t)VAR_MULT * absvar_diff) / VPXMAX(1, src_variance); + var_factor = VPXMIN(adj_max, var_factor); - // Source variance above a threshold and ref frame is intra. - // This case is targeted mainly at discouraging intra modes that give rise - // to a predictor with a low spatial complexity compared to the source. - if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) && - (source_variance > recon_variance)) { - var_factor = VPXMIN(absvar_diff, VPXMIN(VLOW_ADJ_MAX, var_error)); - // A second possible case of interest is where the source variance - // is very low and we wish to discourage false texture or motion trails. - } else if ((source_variance < (LOW_VAR_THRESH >> 1)) && - (recon_variance > source_variance)) { - var_factor = VPXMIN(absvar_diff, VPXMIN(VHIGH_ADJ_MAX, var_error)); - } *this_rd += (*this_rd * var_factor) / 100; + + if (content_type == VP9E_CONTENT_FILM) { + if (src_rec_min <= VERY_LOW_VAR_THRESH) { + if (ref_frame == INTRA_FRAME) *this_rd *= 2; + if (bsize > 6) *this_rd *= 2; + } + } } // Do we have an internal image edge (e.g. formatting bars). @@ -3037,8 +3063,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; PREDICTION_MODE mode_uv[TX_SIZES]; - const int intra_cost_penalty = vp9_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int best_skip2 = 0; uint8_t ref_frame_skip_mask[2] = { 0 }; uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 }; @@ -3801,8 +3827,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t dist_uv; int skip_uv; PREDICTION_MODE mode_uv = DC_PRED; - const int intra_cost_penalty = vp9_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int_mv seg_mvs[4][MAX_REF_FRAMES]; b_mode_info best_bmodes[4]; int best_skip2 = 0; diff --git a/libvpx/vp9/encoder/vp9_skin_detection.c b/libvpx/vp9/encoder/vp9_skin_detection.c index 3f3d48fb9..cc6c96776 100644 --- a/libvpx/vp9/encoder/vp9_skin_detection.c +++ b/libvpx/vp9/encoder/vp9_skin_detection.c @@ -15,75 +15,6 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_skin_detection.h" -#define MODEL_MODE 1 - -// Fixed-point skin color model parameters. -static const int skin_mean[5][2] = { { 7463, 9614 }, - { 6400, 10240 }, - { 7040, 10240 }, - { 8320, 9280 }, - { 6800, 9614 } }; -static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 -static const int skin_threshold[6] = { 1570636, 1400000, 800000, - 800000, 800000, 800000 }; // q18 - -// Thresholds on luminance. -static const int y_low = 40; -static const int y_high = 220; - -// Evaluates the Mahalanobis distance measure for the input CbCr values. 
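The helper removed below (this patch moves skin detection into vpx_dsp, hence the vpx_skin_pixel() call above) evaluates d = (x - mu)^T * Sigma^(-1) * (x - mu) for a CbCr pair in fixed point: means in q6, inverse covariance in q16, result in q18. In floating point the same measure reads roughly as follows, using the first model's constants; the matching q18 threshold of 1570636 is about 6.0 in these units:

    /* Floating-point rendering of the fixed-point Mahalanobis distance.
     * skin_mean[0] = {7463, 9614} in q6 is about (116.6, 150.2);
     * skin_inv_cov = {4107, 1663, 1663, 2157} in q16 is about
     * {0.0627, 0.0254, 0.0254, 0.0329}. */
    static double skin_color_difference(double cb, double cr) {
      const double mean_cb = 7463.0 / 64.0, mean_cr = 9614.0 / 64.0;
      const double icov00 = 4107.0 / 65536.0, icov01 = 1663.0 / 65536.0;
      const double icov11 = 2157.0 / 65536.0;
      const double dcb = cb - mean_cb, dcr = cr - mean_cr;
      return icov00 * dcb * dcb + 2.0 * icov01 * dcb * dcr +
             icov11 * dcr * dcr;
    }

The removed fixed-point implementation follows.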
-static int evaluate_skin_color_difference(int cb, int cr, int idx) { - const int cb_q6 = cb << 6; - const int cr_q6 = cr << 6; - const int cb_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); - const int cbcr_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); - const int cr_diff_q12 = - (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); - const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; - const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; - const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; - const int skin_diff = - skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + - skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; - return skin_diff; -} - -int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr, - int motion) { - if (y < y_low || y > y_high) { - return 0; - } else { - if (MODEL_MODE == 0) { - return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); - } else { - int i = 0; - // Exit on grey. - if (cb == 128 && cr == 128) return 0; - // Exit on very strong cb. - if (cb > 150 && cr < 110) return 0; - for (; i < 5; i++) { - int skin_color_diff = evaluate_skin_color_difference(cb, cr, i); - if (skin_color_diff < skin_threshold[i + 1]) { - if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) - return 0; - else if (motion == 0 && - skin_color_diff > (skin_threshold[i + 1] >> 1)) - return 0; - else - return 1; - } - // Exit if difference is much large than the threshold. - if (skin_color_diff > (skin_threshold[i + 1] << 3)) { - return 0; - } - } - return 0; - } - } -} - int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, int stride, int strideuv, int bsize, int consec_zeromv, int curr_motion_magn) { @@ -100,31 +31,113 @@ int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, const uint8_t ysource = y[y_height_shift * stride + y_width_shift]; const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift]; const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift]; + if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0; - return vp9_skin_pixel(ysource, usource, vsource, motion); + return vpx_skin_pixel(ysource, usource, vsource, motion); + } +} + +void vp9_compute_skin_sb(VP9_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + int i, j, num_bl; + VP9_COMMON *const cm = &cpi->common; + const uint8_t *src_y = cpi->Source->y_buffer; + const uint8_t *src_u = cpi->Source->u_buffer; + const uint8_t *src_v = cpi->Source->v_buffer; + const int src_ystride = cpi->Source->y_stride; + const int src_uvstride = cpi->Source->uv_stride; + const int y_bsize = 4 << b_width_log2_lookup[bsize]; + const int uv_bsize = y_bsize >> 1; + const int shy = (y_bsize == 8) ? 3 : 4; + const int shuv = shy - 1; + const int fac = y_bsize / 8; + const int y_shift = src_ystride * (mi_row << 3) + (mi_col << 3); + const int uv_shift = src_uvstride * (mi_row << 2) + (mi_col << 2); + const int mi_row_limit = VPXMIN(mi_row + 8, cm->mi_rows - 2); + const int mi_col_limit = VPXMIN(mi_col + 8, cm->mi_cols - 2); + src_y += y_shift; + src_u += uv_shift; + src_v += uv_shift; + + for (i = mi_row; i < mi_row_limit; i += fac) { + num_bl = 0; + for (j = mi_col; j < mi_col_limit; j += fac) { + int consec_zeromv = 0; + int bl_index = i * cm->mi_cols + j; + int bl_index1 = bl_index + 1; + int bl_index2 = bl_index + cm->mi_cols; + int bl_index3 = bl_index2 + 1; + // Don't detect skin on the boundary. 
+ if (i == 0 || j == 0) continue; + if (bsize == BLOCK_8X8) + consec_zeromv = cpi->consec_zero_mv[bl_index]; + else + consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], + VPXMIN(cpi->consec_zero_mv[bl_index1], + VPXMIN(cpi->consec_zero_mv[bl_index2], + cpi->consec_zero_mv[bl_index3]))); + cpi->skin_map[bl_index] = + vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride, + bsize, consec_zeromv, 0); + num_bl++; + src_y += y_bsize; + src_u += uv_bsize; + src_v += uv_bsize; + } + src_y += (src_ystride << shy) - (num_bl << shy); + src_u += (src_uvstride << shuv) - (num_bl << shuv); + src_v += (src_uvstride << shuv) - (num_bl << shuv); + } + + // Remove isolated skin blocks (none of its neighbors are skin) and isolated + // non-skin blocks (all of its neighbors are skin). + // Skip 4 corner blocks which have only 3 neighbors to remove isolated skin + // blocks. Skip superblock borders to remove isolated non-skin blocks. + for (i = mi_row; i < mi_row_limit; i += fac) { + for (j = mi_col; j < mi_col_limit; j += fac) { + int bl_index = i * cm->mi_cols + j; + int num_neighbor = 0; + int mi, mj; + int non_skin_threshold = 8; + // Skip 4 corners. + if ((i == mi_row && (j == mi_col || j == mi_col_limit - fac)) || + (i == mi_row_limit - fac && (j == mi_col || j == mi_col_limit - fac))) + continue; + // There are only 5 neighbors for non-skin blocks on the border. + if (i == mi_row || i == mi_row_limit - fac || j == mi_col || + j == mi_col_limit - fac) + non_skin_threshold = 5; + + for (mi = -fac; mi <= fac; mi += fac) { + for (mj = -fac; mj <= fac; mj += fac) { + if (i + mi >= mi_row && i + mi < mi_row_limit && j + mj >= mi_col && + j + mj < mi_col_limit) { + int bl_neighbor_index = (i + mi) * cm->mi_cols + j + mj; + if (cpi->skin_map[bl_neighbor_index]) num_neighbor++; + } + } + } + + if (cpi->skin_map[bl_index] && num_neighbor < 2) + cpi->skin_map[bl_index] = 0; + if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold) + cpi->skin_map[bl_index] = 1; + } } } #ifdef OUTPUT_YUV_SKINMAP // For viewing skin map on input source. -void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { +void vp9_output_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { int i, j, mi_row, mi_col, num_bl; VP9_COMMON *const cm = &cpi->common; uint8_t *y; const uint8_t *src_y = cpi->Source->y_buffer; - const uint8_t *src_u = cpi->Source->u_buffer; - const uint8_t *src_v = cpi->Source->v_buffer; const int src_ystride = cpi->Source->y_stride; - const int src_uvstride = cpi->Source->uv_stride; - int y_bsize = 16; // Use 8x8 or 16x16. - int uv_bsize = y_bsize >> 1; - int ypos = y_bsize >> 1; - int uvpos = uv_bsize >> 1; - int shy = (y_bsize == 8) ? 3 : 4; - int shuv = shy - 1; - int fac = y_bsize / 8; - // Use center pixel or average of center 2x2 pixels. - int mode_filter = 0; + const int y_bsize = 16; // Use 8x8 or 16x16. + const int shy = (y_bsize == 8) ? 3 : 4; + const int fac = y_bsize / 8; + YV12_BUFFER_CONFIG skinmap; memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG)); if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height, cm->subsampling_x, @@ -141,65 +154,21 @@ void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { for (mi_row = 0; mi_row < cm->mi_rows - 1; mi_row += fac) { num_bl = 0; for (mi_col = 0; mi_col < cm->mi_cols - 1; mi_col += fac) { - int is_skin = 0; - if (mode_filter == 1) { - // Use 2x2 average at center. 
- uint8_t ysource = src_y[ypos * src_ystride + ypos]; - uint8_t usource = src_u[uvpos * src_uvstride + uvpos]; - uint8_t vsource = src_v[uvpos * src_uvstride + uvpos]; - uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos]; - uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos]; - uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos]; - uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)]; - uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos + 1)]; - uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos + 1)]; - uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)]; - uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos + 1)]; - uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos + 1)]; - ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2; - usource = (usource + usource2 + usource3 + usource4) >> 2; - vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2; - is_skin = vp9_skin_pixel(ysource, usource, vsource, 1); - } else { - int block_size = BLOCK_8X8; - int consec_zeromv = 0; - int bl_index = mi_row * cm->mi_cols + mi_col; - int bl_index1 = bl_index + 1; - int bl_index2 = bl_index + cm->mi_cols; - int bl_index3 = bl_index2 + 1; - if (y_bsize == 8) - consec_zeromv = cpi->consec_zero_mv[bl_index]; - else - consec_zeromv = - VPXMIN(cpi->consec_zero_mv[bl_index], - VPXMIN(cpi->consec_zero_mv[bl_index1], - VPXMIN(cpi->consec_zero_mv[bl_index2], - cpi->consec_zero_mv[bl_index3]))); - if (y_bsize == 16) block_size = BLOCK_16X16; - is_skin = - vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, - src_uvstride, block_size, consec_zeromv, 0); - } + const int block_index = mi_row * cm->mi_cols + mi_col; + const int is_skin = cpi->skin_map[block_index]; for (i = 0; i < y_bsize; i++) { for (j = 0; j < y_bsize; j++) { - if (is_skin) - y[i * src_ystride + j] = 255; - else - y[i * src_ystride + j] = src_y[i * src_ystride + j]; + y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j]; } } num_bl++; y += y_bsize; src_y += y_bsize; - src_u += uv_bsize; - src_v += uv_bsize; } y += (src_ystride << shy) - (num_bl << shy); src_y += (src_ystride << shy) - (num_bl << shy); - src_u += (src_uvstride << shuv) - (num_bl << shuv); - src_v += (src_uvstride << shuv) - (num_bl << shuv); } - vp9_write_yuv_frame_420(&skinmap, yuv_skinmap_file); + vpx_write_yuv_frame(yuv_skinmap_file, &skinmap); vpx_free_frame_buffer(&skinmap); } #endif diff --git a/libvpx/vp9/encoder/vp9_skin_detection.h b/libvpx/vp9/encoder/vp9_skin_detection.h index c77382dbd..8880bff46 100644 --- a/libvpx/vp9/encoder/vp9_skin_detection.h +++ b/libvpx/vp9/encoder/vp9_skin_detection.h @@ -12,6 +12,8 @@ #define VP9_ENCODER_VP9_SKIN_MAP_H_ #include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/skin_detection.h" +#include "vpx_util/vpx_write_yuv_frame.h" #ifdef __cplusplus extern "C" { @@ -19,19 +21,16 @@ extern "C" { struct VP9_COMP; -// #define OUTPUT_YUV_SKINMAP - -int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr, - int motion); - int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, int stride, int strideuv, int bsize, int consec_zeromv, int curr_motion_magn); +void vp9_compute_skin_sb(struct VP9_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col); + #ifdef OUTPUT_YUV_SKINMAP // For viewing skin map on input source. 
-void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); -extern void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f); +void vp9_output_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); #endif #ifdef __cplusplus diff --git a/libvpx/vp9/encoder/vp9_speed_features.c b/libvpx/vp9/encoder/vp9_speed_features.c index 8d9e2e8c3..a05db60c6 100644 --- a/libvpx/vp9/encoder/vp9_speed_features.c +++ b/libvpx/vp9/encoder/vp9_speed_features.c @@ -157,6 +157,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, VP9_COMMON *cm, SPEED_FEATURES *sf, int speed) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; const int boosted = frame_is_boosted(cpi); int i; @@ -182,7 +183,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, } if (speed >= 1) { - if (cpi->oxcf.pass == 2) { + if (oxcf->pass == 2) { TWO_PASS *const twopass = &cpi->twopass; if ((twopass->fr_content_type == FC_GRAPHICS_ANIMATION) || vp9_internal_image_edge(cpi)) { @@ -225,12 +226,16 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, } if (speed >= 2) { - sf->recode_loop = ALLOW_RECODE_KFARFGF; + if (oxcf->vbr_corpus_complexity) + sf->recode_loop = ALLOW_RECODE_FIRST; + else + sf->recode_loop = ALLOW_RECODE_KFARFGF; + sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL; // Reference masking is not supported in dynamic scaling mode. - sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0; + sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC ? 1 : 0; sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) @@ -240,7 +245,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->disable_filter_search_var_thresh = 100; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; - sf->allow_partition_search_skip = 1; sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 45; @@ -271,6 +275,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; sf->adaptive_interp_filter_search = 1; + sf->allow_partition_search_skip = 1; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { @@ -364,6 +369,11 @@ static void set_rt_speed_feature_framesize_independent( sf->copy_partition_flag = 0; sf->use_source_sad = 0; sf->use_simple_block_yrd = 0; + sf->adapt_partition_source_sad = 0; + sf->use_altref_onepass = 0; + sf->use_compound_nonrd_pickmode = 0; + sf->nonrd_keyframe = 0; + sf->svc_use_lowres_part = 0; if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; @@ -441,6 +451,8 @@ static void set_rt_speed_feature_framesize_independent( if (speed >= 4) { int i; + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) + sf->use_altref_onepass = 1; sf->last_partitioning_redo_frequency = 4; sf->adaptive_rd_thresh = 5; sf->use_fast_coef_costing = 0; @@ -466,6 +478,7 @@ static void set_rt_speed_feature_framesize_independent( } if (speed >= 5) { + sf->use_altref_onepass = 0; sf->use_quant_fp = !is_keyframe; sf->auto_min_max_partition_size = is_keyframe ? 
RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
@@ -521,12 +534,30 @@ static void set_rt_speed_feature_framesize_independent(
   }
 
   if (speed >= 6) {
+    if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) {
+      sf->use_altref_onepass = 1;
+      sf->use_compound_nonrd_pickmode = 1;
+    }
     sf->partition_search_type = VAR_BASED_PARTITION;
     // Turn on this to use non-RD key frame coding mode.
     sf->use_nonrd_pick_mode = 1;
     sf->mv.search_method = NSTEP;
     sf->mv.reduce_first_step_size = 1;
     sf->skip_encode_sb = 0;
+
+    if (!cpi->external_resize) sf->use_source_sad = 1;
+
+    if (sf->use_source_sad) {
+      sf->adapt_partition_source_sad = 1;
+      sf->adapt_partition_thresh =
+          (cm->width * cm->height <= 640 * 360) ? 40000 : 60000;
+      if (cpi->content_state_sb_fd == NULL &&
+          (!cpi->use_svc ||
+           cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
+        cpi->content_state_sb_fd = (uint8_t *)vpx_calloc(
+            (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t));
+      }
+    }
     if (cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) {
       // Enable short circuit for low temporal variance.
       sf->short_circuit_low_temp_var = 1;
@@ -534,53 +565,64 @@ static void set_rt_speed_feature_framesize_independent(
     if (cpi->svc.temporal_layer_id > 0) {
       sf->adaptive_rd_thresh = 4;
       sf->limit_newmv_early_exit = 0;
-      sf->mv.subpel_force_stop = (cpi->svc.temporal_layer_id == 1) ? 1 : 2;
-      sf->base_mv_aggressive =
-          (cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1)
-              ? 1
-              : 0;
+      sf->base_mv_aggressive = 1;
     }
   }
 
   if (speed >= 7) {
+    sf->adapt_partition_source_sad = 0;
     sf->adaptive_rd_thresh = 3;
     sf->mv.search_method = FAST_DIAMOND;
     sf->mv.fullpel_search_step_param = 10;
+    // For SVC: use better mv search on base temporal layer, and only
+    // on base spatial layer if highest resolution is above 640x360.
    if (cpi->svc.number_temporal_layers > 2 &&
-        cpi->svc.temporal_layer_id == 0) {
+        cpi->svc.temporal_layer_id == 0 &&
+        (cpi->svc.spatial_layer_id == 0 ||
+         cpi->oxcf.width * cpi->oxcf.height <= 640 * 360)) {
       sf->mv.search_method = NSTEP;
       sf->mv.fullpel_search_step_param = 6;
     }
-    if (!cpi->external_resize) sf->use_source_sad = 1;
-    if (sf->use_source_sad) {
-      if (cpi->content_state_sb_fd == NULL &&
-          (!cpi->use_svc ||
-           cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
-        cpi->content_state_sb_fd = (uint8_t *)vpx_calloc(
-            (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t));
-      }
+    if (cpi->svc.temporal_layer_id > 0 || cpi->svc.spatial_layer_id > 1) {
+      sf->use_simple_block_yrd = 1;
+      if (cpi->svc.non_reference_frame)
+        sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE;
     }
-  }
-
-  if (speed >= 8) {
-    sf->adaptive_rd_thresh = 4;
-    // Enable partition copy. For SVC, only enabled for top resolution layer,
+    if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1)
+      sf->adaptive_rd_thresh_row_mt = 1;
+    // Enable partition copy. For SVC, only enabled for the top spatial
+    // resolution layer.
+    cpi->max_copied_frame = 0;
     if (!cpi->last_frame_dropped && cpi->resize_state == ORIG &&
         !cpi->external_resize &&
        (!cpi->use_svc ||
          cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
      sf->copy_partition_flag = 1;
-      cpi->max_copied_frame = 4;
+      cpi->max_copied_frame = 2;
+      // Frames in the top temporal enhancement layer (when the number of
+      // temporal layers is > 1) are non-reference frames, so use a large/max
+      // value for max_copied_frame.
+ if (cpi->svc.number_temporal_layers > 1 && + cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1) + cpi->max_copied_frame = 255; } + // For SVC: enable use of lower resolution partition for higher resolution, + // only for 3 spatial layers and when config/top resolution is above VGA. + // Enable only for non-base temporal layer frames. + if (cpi->use_svc && cpi->svc.number_spatial_layers == 3 && + cpi->svc.temporal_layer_id > 0 && + cpi->oxcf.width * cpi->oxcf.height > 640 * 480) + sf->svc_use_lowres_part = 1; + } + if (speed >= 8) { + sf->adaptive_rd_thresh = 4; + sf->skip_encode_sb = 1; + sf->nonrd_keyframe = 1; + if (!cpi->use_svc) cpi->max_copied_frame = 4; if (cpi->row_mt && cpi->oxcf.max_threads > 1) sf->adaptive_rd_thresh_row_mt = 1; - if (content == VP9E_CONTENT_SCREEN) - sf->mv.subpel_force_stop = 3; - else if (cm->width * cm->height > 352 * 288) - sf->mv.subpel_force_stop = 2; - + if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = 3; if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF; // Only keep INTRA_DC mode for speed 8. if (!is_keyframe) { @@ -610,6 +652,20 @@ static void set_rt_speed_feature_framesize_independent( sf->limit_newmv_early_exit = 0; sf->use_simple_block_yrd = 1; } + if (sf->use_altref_onepass) { + if (cpi->rc.is_src_frame_alt_ref && cm->frame_type != KEY_FRAME) { + sf->partition_search_type = FIXED_PARTITION; + sf->always_this_block_size = BLOCK_64X64; + } + if (cpi->count_arf_frame_usage == NULL) + cpi->count_arf_frame_usage = + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->count_arf_frame_usage)); + if (cpi->count_lastgolden_frame_usage == NULL) + cpi->count_lastgolden_frame_usage = + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->count_lastgolden_frame_usage)); + } } void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { @@ -651,7 +707,8 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { // and multiple threads match. // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since // adaptive_rd_thresh is defined per-row for non-rd pickmode. - if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact) + if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && + oxcf->max_threads > 1) sf->adaptive_rd_thresh = 0; // This is only used in motion vector unit test. @@ -768,7 +825,6 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { else if (oxcf->mode == GOOD) set_good_speed_feature_framesize_independent(cpi, cm, sf, oxcf->speed); - cpi->full_search_sad = vp9_full_search_sad; cpi->diamond_search_sad = vp9_diamond_search_sad; // Slow quant, dct and trellis not worthwhile for first pass @@ -808,7 +864,8 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { // and multiple threads match. // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since // adaptive_rd_thresh is defined per-row for non-rd pickmode. - if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact) + if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && + oxcf->max_threads > 1) sf->adaptive_rd_thresh = 0; // This is only used in motion vector unit test. 
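The three per-superblock maps allocated in the hunks above (content_state_sb_fd, count_arf_frame_usage, and count_lastgolden_frame_usage) share one sizing rule: VP9 mode-info (mi) units cover 8x8 pixels, so shifting the mi dimensions right by 3 leaves one entry per 64x64 superblock. A minimal sketch of that sizing, with alloc_sb_map as a hypothetical stand-alone helper (not part of the patch):

    #include <stdlib.h>

    /* One byte of state per 64x64 superblock: a superblock spans 8 mi units
     * of 8x8 pixels each, hence the >> 3. The "+ 1" keeps a partial last row
     * of superblocks addressable when mi_rows is not a multiple of 8.
     * mi_stride/mi_rows mirror cm->mi_stride and cm->mi_rows. */
    static unsigned char *alloc_sb_map(int mi_stride, int mi_rows) {
      const size_t sb_cols = (size_t)(mi_stride >> 3);
      const size_t sb_rows = (size_t)(mi_rows >> 3) + 1;
      return (unsigned char *)calloc(sb_cols * sb_rows, sizeof(unsigned char));
    }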
diff --git a/libvpx/vp9/encoder/vp9_speed_features.h b/libvpx/vp9/encoder/vp9_speed_features.h index ee485a35f..50d52bc23 100644 --- a/libvpx/vp9/encoder/vp9_speed_features.h +++ b/libvpx/vp9/encoder/vp9_speed_features.h @@ -490,6 +490,24 @@ typedef struct SPEED_FEATURES { int use_source_sad; int use_simple_block_yrd; + + // If source sad of superblock is high (> adapt_partition_thresh), will switch + // from VARIANCE_PARTITION to REFERENCE_PARTITION (which selects partition + // based on the nonrd-pickmode). + int adapt_partition_source_sad; + int adapt_partition_thresh; + + // Enable use of alt-refs in 1 pass VBR. + int use_altref_onepass; + + // Enable use of compound prediction, for nonrd_pickmode with nonzero lag. + int use_compound_nonrd_pickmode; + + // Always use nonrd_pick_intra for all block sizes on keyframes. + int nonrd_keyframe; + + // For SVC: enables use of partition from lower spatial resolution. + int svc_use_lowres_part; } SPEED_FEATURES; struct VP9_COMP; diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libvpx/vp9/encoder/vp9_svc_layercontext.c index 5867a6c38..2636bd9a5 100644 --- a/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ b/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -36,6 +36,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->scaled_temp_is_alloc = 0; svc->scaled_one_half = 0; svc->current_superframe = 0; + svc->non_reference_frame = 0; + for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1; for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { svc->ext_frame_flags[sl] = 0; @@ -173,7 +175,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, RATE_CONTROL *const lrc = &lc->rc; lc->spatial_layer_target_bandwidth = spatial_layer_target; - bitrate_alloc = (float)lc->target_bandwidth / spatial_layer_target; + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; lrc->starting_buffer_level = (int64_t)(rc->starting_buffer_level * bitrate_alloc); lrc->optimal_buffer_level = @@ -351,6 +353,7 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { } } +#if !CONFIG_REALTIME_ONLY void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; int i; @@ -366,6 +369,7 @@ void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { } svc->spatial_layer_id = 0; } +#endif // !CONFIG_REALTIME_ONLY void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { LAYER_CONTEXT *const lc = @@ -386,9 +390,9 @@ int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) { .is_key_frame; } -static void get_layer_resolution(const int width_org, const int height_org, - const int num, const int den, int *width_out, - int *height_out) { +void get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out) { int w, h; if (width_out == NULL || height_out == NULL || den == 0) return; @@ -603,6 +607,7 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { LAYER_CONTEXT *lc = NULL; if (cpi->svc.number_spatial_layers > 1) cpi->svc.use_base_mv = 1; cpi->svc.force_zero_mode_spatial_ref = 1; + cpi->svc.mi_stride[cpi->svc.spatial_layer_id] = cpi->common.mi_stride; if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { set_flags_and_fb_idx_for_temporal_mode3(cpi); @@ -652,9 +657,9 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { lc->scaling_factor_num, lc->scaling_factor_den, &width, &height); - // For low resolutions: set phase of the filter = 8 (for symmetric averaging - // filter), use bilinear for now. 
- if (width <= 320 && height <= 240) { + // For resolutions <= VGA: set phase of the filter = 8 (for symmetric + // averaging filter), use bilinear for now. + if (width * height <= 640 * 480) { cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR; cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8; } @@ -677,6 +682,12 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { } } + cpi->svc.non_reference_frame = 0; + if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame && + !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) { + cpi->svc.non_reference_frame = 1; + } + if (vp9_set_size_literal(cpi, width, height) != 0) return VPX_CODEC_INVALID_PARAM; @@ -851,3 +862,28 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) { vp9_update_temporal_layer_framerate(cpi); vp9_restore_layer_context(cpi); } + +void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { + SVC *svc = &cpi->svc; + int sl, tl; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + // Check for reset based on avg_frame_bandwidth for spatial layer sl. + int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) || + lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) { + // Reset for all temporal layers with spatial layer sl. + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->rc_1_frame = 0; + lrc->rc_2_frame = 0; + lrc->bits_off_target = lrc->optimal_buffer_level; + lrc->buffer_level = lrc->optimal_buffer_level; + } + } + } +} diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libvpx/vp9/encoder/vp9_svc_layercontext.h index d8e6772b2..b7cdfd962 100644 --- a/libvpx/vp9/encoder/vp9_svc_layercontext.h +++ b/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -49,7 +49,7 @@ typedef struct { uint8_t speed; } LAYER_CONTEXT; -typedef struct { +typedef struct SVC { int spatial_layer_id; int temporal_layer_id; int number_spatial_layers; @@ -87,6 +87,7 @@ typedef struct { int ref_frame_index[REF_FRAMES]; int force_zero_mode_spatial_ref; int current_superframe; + int non_reference_frame; int use_base_mv; // Used to control the downscaling filter for source scaling, for 1 pass CBR. // downsample_filter_phase: = 0 will do sub-sampling (no weighted average), @@ -95,6 +96,11 @@ typedef struct { // eighttap_smooth, eighttap_sharp, and bilinear. INTERP_FILTER downsample_filter_type[VPX_SS_MAX_LAYERS]; int downsample_filter_phase[VPX_SS_MAX_LAYERS]; + + BLOCK_SIZE *prev_partition_svc; + int mi_stride[VPX_MAX_LAYERS]; + + int first_layer_denoise; } SVC; struct VP9_COMP; @@ -124,6 +130,10 @@ void vp9_save_layer_context(struct VP9_COMP *const cpi); // Initialize second pass rc for spatial svc. 
void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi); +void get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out); + // Increment number of video frames in layer void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi); @@ -144,6 +154,8 @@ void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi); void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi); +void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c index 630794156..2758c42ae 100644 --- a/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -350,6 +350,27 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, td->mb.mv_limits.col_max = ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + if (cpi->oxcf.content == VP9E_CONTENT_FILM) { + unsigned int src_variance; + struct buf_2d src; + + src.buf = f->y_buffer + mb_y_offset; + src.stride = f->y_stride; + +#if CONFIG_VP9_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + src_variance = + vp9_high_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16, mbd->bd); + } else { + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16); + } +#else + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (src_variance <= 2) strength = VPXMAX(0, (int)strength - 2); + } + for (frame = 0; frame < frame_count; frame++) { const uint32_t thresh_low = 10000; const uint32_t thresh_high = 20000; diff --git a/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/libvpx/vp9/encoder/x86/temporal_filter_sse4.c index be4cd8685..460dab659 100644 --- a/libvpx/vp9/encoder/x86/temporal_filter_sse4.c +++ b/libvpx/vp9/encoder/x86/temporal_filter_sse4.c @@ -11,6 +11,7 @@ #include <assert.h> #include <smmintrin.h> +#include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" diff --git a/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c index 09a1e48fc..dbd243ac1 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/x86/fwd_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_ports/mem.h" @@ -71,7 +72,7 @@ static INLINE void transpose_4x4(__m128i *res) { } static void fdct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); @@ -193,7 +194,7 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. 
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -706,61 +707,9 @@ static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, store_output(&res[7], (output + 7 * stride)); } -// perform in-place transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 -} - static void fdct8_sse2(__m128i *in) { // constants - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -895,7 +844,7 @@ static void fdct8_sse2(__m128i *in) { in[7] = _mm_packs_epi32(v6, v7); // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); } static void fadst8_sse2(__m128i *in) { @@ -912,7 +861,7 @@ static void fadst8_sse2(__m128i *in) { const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = 
_mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__const_0 = _mm_set1_epi16(0); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -1125,7 +1074,7 @@ static void fadst8_sse2(__m128i *in) { in[7] = _mm_sub_epi16(k__const_0, s1); // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); } void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, @@ -1182,23 +1131,6 @@ static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, write_buffer_8x8(output + 8 * stride, in1 + 8, stride); } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { // perform rounding operations right_shift_8x8(res0, 2); @@ -1210,7 +1142,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { static void fdct16_8col(__m128i *in) { // perform 16x16 1-D DCT for 8 columns __m128i i[8], s[8], p[8], t[8], u[16], v[16]; - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); @@ -1557,8 +1489,8 @@ static void fadst16_8col(__m128i *in) { const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -2002,13 +1934,13 @@ static void fadst16_8col(__m128i *in) { static void fdct16_sse2(__m128i *in0, __m128i *in1) { fdct16_8col(in0); fdct16_8col(in1); - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); } static void fadst16_sse2(__m128i *in0, __m128i *in1) { fadst16_8col(in0); fadst16_8col(in1); - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); } void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, diff --git a/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c index db57ee1f1..bf874a09e 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c @@ -31,7 +31,7 @@ void vp9_fdct8x8_quant_ssse3( // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. 
const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); diff --git a/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c index 91d0602f9..5930bf491 100644 --- a/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c @@ -13,7 +13,6 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" -#include "vpx_ports/emmintrin_compat.h" #include "vpx/vpx_integer.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_context_tree.h" diff --git a/libvpx/vp9/encoder/x86/vp9_error_avx2.c b/libvpx/vp9/encoder/x86/vp9_error_avx2.c index e228bd8b7..99fef31d1 100644 --- a/libvpx/vp9/encoder/x86/vp9_error_avx2.c +++ b/libvpx/vp9/encoder/x86/vp9_error_avx2.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2014 The WebM project authors. All Rights Reserved. * - * Usee of this source code is governed by a BSD-style license + * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may @@ -105,3 +105,57 @@ int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, _mm_storel_epi64((__m128i *)(ssz), ssz_128); return sse; } + +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + int i; + const __m256i zero = _mm256_setzero_si256(); + __m256i sse_256 = zero; + __m256i sse_hi; + __m128i sse_128; + int64_t sse; + + if (block_size == 16) { + // Load 16 elements for coeff and dqcoeff. + const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + // dqcoeff - coeff + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + // madd (dqcoeff - coeff) + const __m256i error_lo = _mm256_madd_epi16(diff, diff); + // Save the higher 64 bit of each 128 bit lane. + const __m256i error_hi = _mm256_srli_si256(error_lo, 8); + // Add the higher 64 bit to the low 64 bit. + const __m256i error = _mm256_add_epi32(error_lo, error_hi); + // Expand each double word in the lower 64 bits to quad word. + sse_256 = _mm256_unpacklo_epi32(error, zero); + } else { + for (i = 0; i < block_size; i += 16) { + // Load 16 elements for coeff and dqcoeff. + const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + const __m256i error = _mm256_madd_epi16(diff, diff); + // Expand each double word of madd (dqcoeff - coeff) to quad word. + const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero); + const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero); + // Add each quad word of madd (dqcoeff - coeff). + sse_256 = _mm256_add_epi64(sse_256, exp_error_lo); + sse_256 = _mm256_add_epi64(sse_256, exp_error_hi); + coeff += 16; + dqcoeff += 16; + } + } + // Save the higher 64 bit of each 128 bit lane. + sse_hi = _mm256_srli_si256(sse_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + + // Add each 64 bit from each of the 128 bit lane of the 256 bit. 
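+  // (_mm256_castsi256_si128() reads the low 128-bit lane and
+  // _mm256_extractf128_si256(sse_256, 1) the high lane, so this single add
+  // leaves the total in the low 64 bits of sse_128.)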
+ sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + // Store the results. + _mm_storel_epi64((__m128i *)&sse, sse_128); + return sse; +} diff --git a/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c index b53714a02..7685e7bc3 100644 --- a/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -13,159 +13,738 @@ #include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" +#include "vpx_dsp/x86/convolve_ssse3.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_scale/yv12config.h" -extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, - uint8_t filter_type, int phase_scaler); +static INLINE __m128i scale_plane_2_to_1_phase_0_kernel( + const uint8_t *const src, const __m128i *const mask) { + const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0])); + const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16])); + const __m128i a_and = _mm_and_si128(a, *mask); + const __m128i b_and = _mm_and_si128(b, *mask); + return _mm_packus_epi16(a_and, b_and); +} -static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int w, - int h) { +static void scale_plane_2_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; const __m128i mask = _mm_set1_epi16(0x00FF); - const int max_width = w & ~15; - int y; - for (y = 0; y < h; ++y) { - int x; - for (x = 0; x < max_width; x += 16) { - const __m128i a = _mm_loadu_si128((const __m128i *)(src + x * 2 + 0)); - const __m128i b = _mm_loadu_si128((const __m128i *)(src + x * 2 + 16)); - const __m128i a_and = _mm_and_si128(a, mask); - const __m128i b_and = _mm_and_si128(b, mask); - const __m128i c = _mm_packus_epi16(a_and, b_and); - _mm_storeu_si128((__m128i *)(dst + x), c); - } - for (; x < w; ++x) dst[x] = src[x * 2]; - src += src_stride * 2; - dst += dst_stride; - } + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask); + _mm_storeu_si128((__m128i *)dst, d); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); } -static INLINE __m128i filter(const __m128i *const a, const __m128i *const b, - const __m128i *const c, const __m128i *const d, - const __m128i *const e, const __m128i *const f, - const __m128i *const g, const __m128i *const h) { - const __m128i coeffs_ab = - _mm_set_epi8(6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1); - const __m128i coeffs_cd = _mm_set_epi8(78, -19, 78, -19, 78, -19, 78, -19, 78, - -19, 78, -19, 78, -19, 78, -19); - const __m128i const64_x16 = _mm_set1_epi16(64); - const __m128i ab = _mm_unpacklo_epi8(*a, *b); - const __m128i cd = _mm_unpacklo_epi8(*c, *d); - const __m128i fe = _mm_unpacklo_epi8(*f, *e); - const __m128i hg = _mm_unpacklo_epi8(*h, *g); - const __m128i ab_terms = _mm_maddubs_epi16(ab, coeffs_ab); - const __m128i cd_terms = _mm_maddubs_epi16(cd, coeffs_cd); - const __m128i fe_terms = _mm_maddubs_epi16(fe, coeffs_cd); - const __m128i hg_terms = _mm_maddubs_epi16(hg, coeffs_ab); - // can not overflow - const __m128i abcd_terms = _mm_add_epi16(ab_terms, cd_terms); - // can not overflow - const __m128i fehg_terms = 
_mm_add_epi16(fe_terms, hg_terms); - // can overflow, use saturating add - const __m128i terms = _mm_adds_epi16(abcd_terms, fehg_terms); - const __m128i round = _mm_adds_epi16(terms, const64_x16); - const __m128i shift = _mm_srai_epi16(round, 7); - return _mm_packus_epi16(shift, shift); +static void scale_plane_4_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; + const __m128i mask = _mm_set1_epi32(0x000000FF); + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask); + const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask); + const __m128i d2 = _mm_packus_epi16(d0, d1); + _mm_storeu_si128((__m128i *)dst, d2); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); } -static void eight_tap_row_ssse3(const uint8_t *src, uint8_t *dst, int w) { - const int max_width = w & ~7; - int x = 0; - for (; x < max_width; x += 8) { - const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x + 0)); - const __m128i b = _mm_loadl_epi64((const __m128i *)(src + x + 1)); - const __m128i c = _mm_loadl_epi64((const __m128i *)(src + x + 2)); - const __m128i d = _mm_loadl_epi64((const __m128i *)(src + x + 3)); - const __m128i e = _mm_loadl_epi64((const __m128i *)(src + x + 4)); - const __m128i f = _mm_loadl_epi64((const __m128i *)(src + x + 5)); - const __m128i g = _mm_loadl_epi64((const __m128i *)(src + x + 6)); - const __m128i h = _mm_loadl_epi64((const __m128i *)(src + x + 7)); - const __m128i pack = filter(&a, &b, &c, &d, &e, &f, &g, &h); - _mm_storel_epi64((__m128i *)(dst + x), pack); - } +static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s, + const __m128i c0c1) { + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1); + const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1); + // round and shift by 7 bit each 16 bit + const __m128i t2 = _mm_adds_epi16(t0, k_64); + const __m128i t3 = _mm_adds_epi16(t1, k_64); + const __m128i t4 = _mm_srai_epi16(t2, 7); + const __m128i t5 = _mm_srai_epi16(t3, 7); + return _mm_packus_epi16(t4, t5); } -static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int dst_w, - int dst_h) { - dst_w /= 2; - dst_h /= 2; - { - DECLARE_ALIGNED(16, uint8_t, tmp[1920 * 8]); - uint8_t *tmp0 = tmp + dst_w * 0; - uint8_t *tmp1 = tmp + dst_w * 1; - uint8_t *tmp2 = tmp + dst_w * 2; - uint8_t *tmp3 = tmp + dst_w * 3; - uint8_t *tmp4 = tmp + dst_w * 4; - uint8_t *tmp5 = tmp + dst_w * 5; - uint8_t *tmp6 = tmp + dst_w * 6; - uint8_t *tmp7 = tmp + dst_w * 7; - uint8_t *tmp8 = NULL; - const int max_width = dst_w & ~7; - int y; - eight_tap_row_ssse3(src - src_stride * 3 - 3, tmp0, dst_w); - eight_tap_row_ssse3(src - src_stride * 2 - 3, tmp1, dst_w); - eight_tap_row_ssse3(src - src_stride * 1 - 3, tmp2, dst_w); - eight_tap_row_ssse3(src + src_stride * 0 - 3, tmp3, dst_w); - eight_tap_row_ssse3(src + src_stride * 1 - 3, tmp4, dst_w); - eight_tap_row_ssse3(src + src_stride * 2 - 3, tmp5, dst_w); - eight_tap_row_ssse3(src + src_stride * 3 - 3, tmp6, dst_w); - for (y = 0; y < dst_h; y++) { - int x; - eight_tap_row_ssse3(src + src_stride * 4 - 3, tmp7, dst_w); - for (x = 0; x < max_width; x += 8) { - const __m128i A = _mm_loadl_epi64((const __m128i *)(src + x)); - const __m128i B = 
_mm_loadl_epi64((const __m128i *)(tmp3 + x)); - const __m128i AB = _mm_unpacklo_epi8(A, B); - __m128i C, D, CD; - _mm_storeu_si128((__m128i *)(dst + x * 2), AB); - { - const __m128i a = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 3)); - const __m128i b = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 2)); - const __m128i c = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 1)); - const __m128i d = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 0)); - const __m128i e = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 1)); - const __m128i f = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 2)); - const __m128i g = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 3)); - const __m128i h = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 4)); - C = filter(&a, &b, &c, &d, &e, &f, &g, &h); - } - { - const __m128i a = _mm_loadl_epi64((const __m128i *)(tmp0 + x)); - const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp1 + x)); - const __m128i c = _mm_loadl_epi64((const __m128i *)(tmp2 + x)); - const __m128i d = _mm_loadl_epi64((const __m128i *)(tmp3 + x)); - const __m128i e = _mm_loadl_epi64((const __m128i *)(tmp4 + x)); - const __m128i f = _mm_loadl_epi64((const __m128i *)(tmp5 + x)); - const __m128i g = _mm_loadl_epi64((const __m128i *)(tmp6 + x)); - const __m128i h = _mm_loadl_epi64((const __m128i *)(tmp7 + x)); - D = filter(&a, &b, &c, &d, &e, &f, &g, &h); - } - CD = _mm_unpacklo_epi8(C, D); - _mm_storeu_si128((__m128i *)(dst + x * 2 + dst_stride), CD); - } - src += src_stride; - dst += dst_stride * 2; - tmp8 = tmp0; - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = tmp5; - tmp5 = tmp6; - tmp6 = tmp7; - tmp7 = tmp8; +static void scale_plane_2_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[2], d[2]; + + // Horizontal + // Even rows + s[0] = _mm_loadu_si128((const __m128i *)(src + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + 16)); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + // odd rows + s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + d[1] = scale_plane_bilinear_kernel(s, c0c1); + + // Vertical + s[0] = _mm_unpacklo_epi8(d[0], d[1]); + s[1] = _mm_unpackhi_epi8(d[0], d[1]); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[8], d[8]; + + // Note: Using _mm_packus_epi32() in SSE4.1 could be faster. + // Here we tried to not use shuffle instructions which would be slow + // on some x86 CPUs. 
+ + // Horizontal + // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx + // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx + // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx + // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx + // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx + // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx + // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx + // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx + s[0] = _mm_loadu_si128((const __m128i *)(&src[0])); + s[1] = _mm_loadu_si128((const __m128i *)(&src[16])); + s[2] = _mm_loadu_si128((const __m128i *)(&src[32])); + s[3] = _mm_loadu_si128((const __m128i *)(&src[48])); + s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32)); + s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48)); + + // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx + // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx + // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx + // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx + // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx + // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx + // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx + // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx + d[0] = _mm_unpacklo_epi16(s[0], s[4]); + d[1] = _mm_unpackhi_epi16(s[0], s[4]); + d[2] = _mm_unpacklo_epi16(s[1], s[5]); + d[3] = _mm_unpackhi_epi16(s[1], s[5]); + d[4] = _mm_unpacklo_epi16(s[2], s[6]); + d[5] = _mm_unpackhi_epi16(s[2], s[6]); + d[6] = _mm_unpacklo_epi16(s[3], s[7]); + d[7] = _mm_unpackhi_epi16(s[3], s[7]); + + // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx + // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx + // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx + // 014 015 114 115 01C 01D 11C 11D xx xx xx xx xx xx xx xx + // 020 021 120 121 028 029 128 129 xx xx xx xx xx xx xx xx + // 024 025 124 125 02C 02D 12C 12D xx xx xx xx xx xx xx xx + // 030 031 130 131 038 039 138 139 xx xx xx xx xx xx xx xx + // 034 035 134 135 03C 03D 13C 13D xx xx xx xx xx xx xx xx + s[0] = _mm_unpacklo_epi32(d[0], d[1]); + s[1] = _mm_unpackhi_epi32(d[0], d[1]); + s[2] = _mm_unpacklo_epi32(d[2], d[3]); + s[3] = _mm_unpackhi_epi32(d[2], d[3]); + s[4] = _mm_unpacklo_epi32(d[4], d[5]); + s[5] = _mm_unpackhi_epi32(d[4], d[5]); + s[6] = _mm_unpacklo_epi32(d[6], d[7]); + s[7] = _mm_unpackhi_epi32(d[6], d[7]); + + // 000 001 100 101 004 005 104 105 008 009 108 109 00C 00D 10C 10D + // 010 011 110 111 014 015 114 115 018 019 118 119 01C 01D 11C 11D + // 020 021 120 121 024 025 124 125 028 029 128 129 02C 02D 12C 12D + // 030 031 130 131 034 035 134 135 038 039 138 139 03C 03D 13C 13D + d[0] = _mm_unpacklo_epi32(s[0], s[1]); + d[1] = _mm_unpacklo_epi32(s[2], s[3]); + d[2] = _mm_unpacklo_epi32(s[4], s[5]); + d[3] = _mm_unpacklo_epi32(s[6], s[7]); + + d[0] = scale_plane_bilinear_kernel(&d[0], c0c1); + d[1] = scale_plane_bilinear_kernel(&d[2], c0c1); + + // Vertical + d[0] = scale_plane_bilinear_kernel(d, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + 
const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + do { + load_8bit_8x8(src + 2, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[3]); + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + transpose_16bit_4x8(&s[3], &s[3]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[1], f); // 01 11 21 31 41 51 61 71 + d[2] = convolve8_8_ssse3(&s[2], f); // 02 12 22 32 42 52 62 72 + d[3] = convolve8_8_ssse3(&s[3], f); // 03 13 23 33 43 53 63 73 + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + d[0] = _mm_packus_epi16(d[0], d[2]); + d[1] = _mm_packus_epi16(d[1], d[3]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + d[2] = _mm_unpacklo_epi16(d[0], d[1]); + d[3] = _mm_unpackhi_epi16(d[0], d[1]); + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + d[0] = _mm_unpacklo_epi32(d[2], d[3]); + d[1] = _mm_unpackhi_epi32(d[2], d[3]); + store_8bit_8x4_from_16x2(d, t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + t += 8; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor)); + t += 6 * width_hor; + y = height_ver; + + do { + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[3]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[1], f); // 10 11 12 13 14 15 16 17 + d[2] = convolve8_8_ssse3(&s[2], f); // 20 21 22 23 24 25 26 27 + d[3] = convolve8_8_ssse3(&s[3], f); // 30 31 32 33 34 35 36 37 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[1] = _mm_packus_epi16(d[2], d[3]); + store_8bit_8x4_from_16x2(d, dst, dst_stride); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t 
+= 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + do { + load_8bit_8x8(src + 4, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 (overlapped) + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[2]); + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + transpose_16bit_4x8(&s[2], &s[2]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[2], f); // 01 11 21 31 41 51 61 71 + + // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx + // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + d[0] = _mm_packus_epi16(d[0], d[0]); + d[1] = _mm_packus_epi16(d[1], d[1]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + d[0] = _mm_unpacklo_epi16(d[0], d[1]); + store_8bit_4x4_sse2(d[0], t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + + t += 4; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + t += 4 * width_hor; + y = height_ver; + + do { + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[2]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[2], f); // 10 11 12 13 14 15 16 17 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + d[0] = _mm_packus_epi16(d[0], d[1]); + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + + s[0] = s[4]; + s[1] = s[5]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +typedef void (*shuffle_filter_funcs)(const int16_t *const filter, + __m128i *const f); + +typedef __m128i (*convolve8_funcs)(const __m128i *const s, + const __m128i *const f); + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const 
temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[12], d[6], dd[4]; + __m128i f0[4], f1[5], f2[5]; + // The offset of the first row is always less than 1 pixel. + const int offset1_q4 = phase_scaler + 1 * step_q4; + const int offset2_q4 = phase_scaler + 2 * step_q4; + // offset_idx1 and offset_idx2 indicate whether the pixel offset is even (0) + // or odd (1). They are used to choose the src offset and the filter + // coefficient offset. + const int offset_idx1 = (offset1_q4 >> 4) & 1; + const int offset_idx2 = (offset2_q4 >> 4) & 1; + static const shuffle_filter_funcs shuffle_filter_funcs[2] = { + shuffle_filter_ssse3, shuffle_filter_odd_ssse3 + }; + static const convolve8_funcs convolve8_funcs[2] = { + convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 + }; + + assert(w && h); + + shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0); + shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); + shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); + + // Subtract 64 to avoid overflow. + // A coefficient of 128 would be treated as -128 in PMADDUBSW, so subtract + // 64 from it. Coefficient 128 is in either fx[1] or fx[2], depending on the + // phase index. When the filter phase index is 1, the two biggest + // coefficients are shuffled together and their sum is always no less than + // 128, so subtract 64 there as well. After the subtraction, the sum of all + // positive coefficients is no larger than 128 and the sum of all negative + // coefficients is no less than -128, so there is no overflow in the + // convolve8 functions.
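To make the comment above concrete: PMADDUBSW multiplies unsigned 8-bit pixels by signed 8-bit coefficients, so a tap of 128 (the largest value a VP9 InterpKernel tap can take, since the eight taps sum to 128) does not fit in a signed byte, and two large taps shuffled into one pair can push the saturating 16-bit pair sums out of range. Subtracting 64 keeps every intermediate in range, and because convolution is linear the missing 64 * pixel term can be restored afterwards. Below is a minimal scalar sketch of that identity; the demo_* names are hypothetical, this is not the SIMD implementation, and how the real convolve8_8_*_offset_ssse3 helpers restore the term is outside this hunk.

#include <stdint.h>

#define SUBPEL_TAPS 8

/* Reference 8-tap convolution: taps sum to 128, so round with +64, shift 7. */
static int demo_convolve8(const uint8_t *src, const int16_t *taps) {
  int k, sum = 0;
  for (k = 0; k < SUBPEL_TAPS; ++k) sum += taps[k] * src[k];
  return (sum + 64) >> 7;
}

/* Same result with the oversized tap reduced by 64 so that it fits in a
 * signed byte; the missing 64 * src[big] contribution is added back. */
static int demo_convolve8_sub64(const uint8_t *src, const int16_t *taps,
                                int big) {
  int k, sum = 64 * src[big]; /* restore the subtracted contribution */
  for (k = 0; k < SUBPEL_TAPS; ++k)
    sum += (k == big ? taps[k] - 64 : taps[k]) * src[k];
  return (sum + 64) >> 7; /* bit-identical to demo_convolve8() */
}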
+ f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64)); + f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64)); + f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64)); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1; + + // horizontal 6x8 + do { + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[4]); + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F + transpose_16bit_4x8(&s[4], &s[4]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx + // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx + dd[0] = _mm_packus_epi16(d[0], d[2]); + dd[1] = _mm_packus_epi16(d[1], d[3]); + dd[2] = _mm_packus_epi16(d[4], d[4]); + dd[3] = _mm_packus_epi16(d[5], d[5]); + + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75 + d[0] = _mm_unpacklo_epi16(dd[0], dd[1]); + d[1] = _mm_unpackhi_epi16(dd[0], dd[1]); + d[2] = _mm_unpacklo_epi16(dd[2], dd[3]); + + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx + // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx + dd[0] = _mm_unpacklo_epi32(d[0], d[1]); + dd[1] = _mm_unpackhi_epi32(d[0], d[1]); + dd[2] = _mm_unpacklo_epi32(d[2], d[2]); + dd[3] = _mm_unpackhi_epi32(d[2], d[2]); + + // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx + // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx + // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx + // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx + d[0] = _mm_unpacklo_epi64(dd[0], dd[2]); + d[1] = _mm_unpackhi_epi64(dd[0], dd[2]); + d[2] = _mm_unpacklo_epi64(dd[1], dd[3]); + d[3] = _mm_unpackhi_epi64(dd[1], dd[3]); + + // store 4 extra pixels + storeu_8bit_16x4(d, t, stride_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + t += 12; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 3 * stride_hor + 4; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + loadu_8bit_16x4(t, stride_hor, s); + y = height_ver; + + do { + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97 + // A0 B0
A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7 + // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7 + t += 4 * stride_hor; + loadu_8bit_16x4(t, stride_hor, &s[4]); + + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[2] = _mm_packus_epi16(d[2], d[3]); + d[4] = _mm_packus_epi16(d[4], d[5]); + + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]); + _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]); + _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * 2 * height_ver / 3; + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s, + const __m128i *const f) { + __m128i ss[4], temp; + + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + temp = convolve8_8_ssse3(ss, f); + return _mm_packus_epi16(temp, temp); +} + +// Only calculate odd columns since even columns are just src pixels' copies. 
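A scalar model of what one output row computes may help before the SIMD version: for a phase-0 2x upscale, output column 2*i is a verbatim copy of src[i], and output column 2*i + 1 applies the 8-tap half-pel kernel (phase 8, taps summing to 128) centered between src[i] and src[i + 1]. This sketch is illustrative only: the demo_* names are hypothetical, it assumes the caller provides 3 readable pixels left and 4 right of the row (matching the "- 3" offsets used below), and unlike the real helper, which writes only the filtered samples to a temporary row and lets the caller interleave them with the copies, it writes both halves.

#include <stdint.h>

#define SUBPEL_TAPS 8

static uint8_t demo_clip8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One 1:2 phase-0 row: even columns copy, odd columns use the 8-tap
 * half-pel kernel. src must be readable in [-3, src_w + 3]. */
static void demo_upscale_2x_row(const uint8_t *src, uint8_t *dst, int src_w,
                                const int16_t *half_pel) {
  int i, k;
  for (i = 0; i < src_w; ++i) {
    int sum = 0;
    dst[2 * i] = src[i]; /* even column: direct copy */
    for (k = 0; k < SUBPEL_TAPS; ++k) sum += half_pel[k] * src[i - 3 + k];
    dst[2 * i + 1] = demo_clip8((sum + 64) >> 7); /* odd column: filtered */
  }
}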
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst, + const int w, const __m128i *const f) { + int x = w; + + do { + __m128i s[8], temp; + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0)); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 1)); + s[2] = _mm_loadl_epi64((const __m128i *)(src + 2)); + s[3] = _mm_loadl_epi64((const __m128i *)(src + 3)); + s[4] = _mm_loadl_epi64((const __m128i *)(src + 4)); + s[5] = _mm_loadl_epi64((const __m128i *)(src + 5)); + s[6] = _mm_loadl_epi64((const __m128i *)(src + 6)); + s[7] = _mm_loadl_epi64((const __m128i *)(src + 7)); + temp = scale_1_to_2_phase_0_kernel(s, f); + _mm_storel_epi64((__m128i *)dst, temp); + src += 8; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_1_to_2_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int src_w, const int src_h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + int max_width; + int y; + uint8_t *tmp[9]; + __m128i f[4]; + + max_width = (src_w + 7) & ~7; + tmp[0] = temp_buffer + 0 * max_width; + tmp[1] = temp_buffer + 1 * max_width; + tmp[2] = temp_buffer + 2 * max_width; + tmp[3] = temp_buffer + 3 * max_width; + tmp[4] = temp_buffer + 4 * max_width; + tmp[5] = temp_buffer + 5 * max_width; + tmp[6] = temp_buffer + 6 * max_width; + tmp[7] = temp_buffer + 7 * max_width; + + shuffle_filter_ssse3(coef, f); + + scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f); + scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f); + scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f); + scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f); + scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f); + scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f); + scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f); + + y = src_h; + do { + int x; + scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f); + for (x = 0; x < max_width; x += 8) { + __m128i s[8], C, D, CD; + + // Even rows + const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x)); + const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + const __m128i ab = _mm_unpacklo_epi8(a, b); + _mm_storeu_si128((__m128i *)(dst + 2 * x), ab); + + // Odd rows + // Even columns + load_8bit_8x8(src + x - 3 * src_stride, src_stride, s); + C = scale_1_to_2_phase_0_kernel(s, f); + + // Odd columns + s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x)); + s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x)); + s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x)); + s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x)); + s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x)); + s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x)); + s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x)); + D = scale_1_to_2_phase_0_kernel(s, f); + + CD = _mm_unpacklo_epi8(C, D); + _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD); } - } + + src += src_stride; + dst += 2 * dst_stride; + tmp[8] = tmp[0]; + tmp[0] = tmp[1]; + tmp[1] = tmp[2]; + tmp[2] = tmp[3]; + tmp[3] = tmp[4]; + tmp[4] = tmp[5]; + tmp[5] = tmp[6]; + tmp[6] = tmp[7]; + tmp[7] = tmp[8]; + } while (--y); } void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, @@ -177,30 +756,152 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, const int dst_h = dst->y_crop_height; 
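  // Dispatch summary for the ratios handled below: 2:1 and 4:1 each have a
  // phase-0 fast path, a bilinear path, and a general 8-tap path; 4:3 always
  // goes through its general 8-tap path; 1:2 is handled only for phase 0.
  // Paths that need a temporary buffer leave scaled at 0 when malloc() fails,
  // so vp9_scale_and_extend_frame_c() runs as the fallback. On success,
  // vpx_extend_frame_borders(dst) is called exactly once at the end.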
const int dst_uv_w = dst_w / 2; const int dst_uv_h = dst_h / 2; + int scaled = 0; - if (dst_w * 2 == src_w && dst_h * 2 == src_h && phase_scaler == 0) { - downsample_2_to_1_ssse3(src->y_buffer, src->y_stride, dst->y_buffer, - dst->y_stride, dst_w, dst_h); - downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - vpx_extend_frame_borders(dst); - } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) { - // The upsample() supports widths up to 1920 * 2. If greater, fall back - // to vp9_scale_and_extend_frame_c(). - if (dst_w / 2 <= 1920) { - upsample_1_to_2_ssse3(src->y_buffer, src->y_stride, dst->y_buffer, - dst->y_stride, dst_w, dst_h); - upsample_1_to_2_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - upsample_1_to_2_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - vpx_extend_frame_borders(dst); + // phase_scaler is usually 0 or 8. + assert(phase_scaler >= 0 && phase_scaler < 16); + + if (dst_w * 2 == src_w && dst_h * 2 == src_h) { + // 2 to 1 + scaled = 1; + + if (phase_scaler == 0) { + scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0c1); + scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + } else { + const int buffer_stride = (dst_w + 3) & ~3; + const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_2_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_2_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_2_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = 
vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0c1); + scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); } else { - vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); + const int buffer_stride = (dst_w + 1) & ~1; + const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + // When dst_w is 1 or 2, we need extra padding to avoid heap read overflow + const int extra_padding = 16; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding); + if (temp_buffer) { + scale_plane_4_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_4_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_4_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2; + const int buffer_stride_ver = (dst_w + 7) & ~7; + const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + // When the vertical filter reads more pixels than the horizontal filter + // generated in each row, we need extra padding to avoid heap read overflow. + // For example, the horizontal filter generates 18 pixels but the vertical + // filter reads 24 pixels in a row. The difference is multiplied by 2 since + // two rows are interlaced together in the optimization. + const int extra_padding = (buffer_stride_ver > buffer_stride_hor) + ? 
2 * (buffer_stride_ver - buffer_stride_hor) + : 0; + const int buffer_size = buffer_stride_hor * buffer_height + extra_padding; + uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size); + if (temp_buffer) { + scaled = 1; + scale_plane_4_to_3_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + free(temp_buffer); + } + } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) { + // 1 to 2 + uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7)); + if (temp_buffer) { + scaled = 1; + scale_plane_1_to_2_phase_0( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w, + src_h, vp9_filter_kernels[filter_type][8], temp_buffer); + scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, src_w / 2, src_h / 2, + vp9_filter_kernels[filter_type][8], + temp_buffer); + scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, src_w / 2, src_h / 2, + vp9_filter_kernels[filter_type][8], + temp_buffer); + free(temp_buffer); } + } + + if (scaled) { + vpx_extend_frame_borders(dst); } else { + // Call c version for all other scaling ratios. vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); } } diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c index 4a2581a34..ca0ad4407 100644 --- a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <assert.h> #include <emmintrin.h> #include <xmmintrin.h> @@ -25,8 +26,12 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i zero; __m128i thr; int16_t nzflag; + __m128i eob; + __m128i round, quant, dequant; (void)scan_ptr; + (void)skip_block; + assert(!skip_block); coeff_ptr += n_coeffs; iscan_ptr += n_coeffs; @@ -35,40 +40,106 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; zero = _mm_setzero_si128(); - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant; + { + __m128i coeff0, coeff1; + + // Setup global values { - __m128i coeff0, coeff1; + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + } - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + // Do DC and first 15 AC + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + round = _mm_unpackhi_epi64(round, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + quant = _mm_unpackhi_epi64(quant, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); + } - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + 
eob = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob = _mm_max_epi16(eob, eob1); + } + n_coeffs += 8 * 2; + } + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop + while (n_coeffs < 0) { + __m128i coeff0, coeff1; + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); qcoeff1 = _mm_adds_epi16(qcoeff1, round); qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); // Reinsert signs @@ -81,131 +152,51 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } + } else { + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); } - n_coeffs += 8 * 2; } - thr = _mm_srai_epi16(dequant, 1); - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = 
_mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } else { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - } - } - - if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; + if (nzflag) { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob0, eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob0 = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob0 = _mm_max_epi16(eob0, eob1); + eob = _mm_max_epi16(eob, eob0); } + n_coeffs += 8 * 2; + } - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); } } diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index 1b88863f6..5703aa3bb 100644 --- a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ 
b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -22,8 +22,6 @@ SECTION .text cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \ qcoeff, dqcoeff, dequant, \ eob, scan, iscan - cmp dword skipm, 0 - jne .blank ; actual quantize loop - setup pointers, rounders, etc. movifnidn coeffq, coeffmp @@ -171,28 +169,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \ pshuflw m7, m8, 0x1 pmaxsw m8, m7 pextrw r6, m8, 0 - mov [r2], r6 - RET - - ; skip-block, i.e. just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - - lea r0q, [r0q+ncoeffq*2] - lea r2q, [r2q+ncoeffq*2] - neg ncoeffq - pxor m7, m7 -.blank_loop: - STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq - STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq + 8 - STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq - STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq + 8 - add ncoeffq, mmsize - jl .blank_loop - mov word [r3q], 0 + mov [r2], r6w RET %endmacro diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c index 25fc80a9a..881caae78 100644 --- a/libvpx/vp9/vp9_cx_iface.c +++ b/libvpx/vp9/vp9_cx_iface.c @@ -171,12 +171,17 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100); RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); + RANGE_CHECK(cfg, rc_2pass_vbr_corpus_complexity, 0, 10000); RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO); RANGE_CHECK_BOOL(cfg, rc_resize_allowed); RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100); RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100); RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100); +#if CONFIG_REALTIME_ONLY + RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_ONE_PASS); +#else RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS); +#endif RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1)); RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1)); if (extra_cfg->max_gf_interval > 0) { @@ -187,6 +192,13 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, (MAX_LAG_BUFFERS - 1)); } + + // For formation of valid ARF groups, lag_in_frames should be 0 or at + // least max_gf_interval + 2. + if (cfg->g_lag_in_frames > 0 && extra_cfg->max_gf_interval > 0 && + cfg->g_lag_in_frames < extra_cfg->max_gf_interval + 2) { + ERROR("Set lag in frames to 0 (low delay) or >= (max-gf-interval + 2)"); + } + if (cfg->rc_resize_allowed == 1) { RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w); RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h); @@ -202,7 +214,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, level != LEVEL_4 && level != LEVEL_4_1 && level != LEVEL_5 && level != LEVEL_5_1 && level != LEVEL_5_2 && level != LEVEL_6 && level != LEVEL_6_1 && level != LEVEL_6_2 && level != LEVEL_UNKNOWN && - level != LEVEL_MAX) + level != LEVEL_AUTO && level != LEVEL_MAX) ERROR("target_level is invalid"); } @@ -269,6 +281,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, if (extra_cfg->tuning == VP8_TUNE_SSIM) ERROR("Option --tune=ssim is not currently supported in VP9."); +#if !CONFIG_REALTIME_ONLY if (cfg->g_pass == VPX_RC_LAST_PASS) { const size_t packet_sz = sizeof(FIRSTPASS_STATS); const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); @@ -320,6 +333,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, ERROR("rc_twopass_stats_in missing EOS stats packet"); } } +#endif // !CONFIG_REALTIME_ONLY #if !CONFIG_VP9_HIGHBITDEPTH if (cfg->g_profile >
(unsigned int)PROFILE_1) { @@ -425,10 +439,20 @@ static void config_target_level(VP9EncoderConfig *oxcf) { oxcf->worst_allowed_q = vp9_quantizer_to_qindex(63); // Adjust minimum art-ref distance. - if (oxcf->min_gf_interval < - (int)vp9_level_defs[target_level_index].min_altref_distance) + // min_gf_interval should be no less than min_altref_distance + 1, + // as the encoder may produce bitstream with alt-ref distance being + // min_gf_interval - 1. + if (oxcf->min_gf_interval <= + (int)vp9_level_defs[target_level_index].min_altref_distance) { oxcf->min_gf_interval = - (int)vp9_level_defs[target_level_index].min_altref_distance; + (int)vp9_level_defs[target_level_index].min_altref_distance + 1; + // If oxcf->max_gf_interval == 0, it will be assigned with a default value + // in vp9_rc_set_gf_interval_range(). + if (oxcf->max_gf_interval != 0) { + oxcf->max_gf_interval = + VPXMAX(oxcf->max_gf_interval, oxcf->min_gf_interval); + } + } // Adjust maximum column tiles. if (vp9_level_defs[target_level_index].max_col_tiles < @@ -503,6 +527,7 @@ static vpx_codec_err_t set_encoder_config( oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct; oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct; oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct; + oxcf->vbr_corpus_complexity = cfg->rc_2pass_vbr_corpus_complexity; oxcf->auto_key = cfg->kf_mode == VPX_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist; @@ -613,6 +638,7 @@ static vpx_codec_err_t set_encoder_config( printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias); printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section); printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section); + printf("vbr_corpus_complexity: %d\n", oxcf->vbr_corpus_complexity); printf("lag_in_frames: %d\n", oxcf->lag_in_frames); printf("enable_auto_arf: %d\n", oxcf->enable_auto_arf); printf("Version: %d\n", oxcf->Version); @@ -888,12 +914,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, priv->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool)); if (priv->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) { - return VPX_CODEC_MEM_ERROR; - } -#endif - if (ctx->config.enc) { // Update the reference to the config structure to an internal copy. priv->cfg = *ctx->config.enc; @@ -925,9 +945,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) { free(ctx->cx_data); vp9_remove_compressor(ctx->cpi); -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); -#endif vpx_free(ctx->buffer_pool); vpx_free(ctx); return VPX_CODEC_OK; @@ -938,6 +955,10 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, unsigned long deadline) { MODE new_mode = BEST; +#if CONFIG_REALTIME_ONLY + (void)duration; + deadline = VPX_DL_REALTIME; +#else switch (ctx->cfg.g_pass) { case VPX_RC_ONE_PASS: if (deadline > 0) { @@ -958,6 +979,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, case VPX_RC_FIRST_PASS: break; case VPX_RC_LAST_PASS: new_mode = deadline > 0 ? 
GOOD : BEST; break; } +#endif // CONFIG_REALTIME_ONLY if (deadline == VPX_DL_REALTIME) { ctx->oxcf.pass = 0; @@ -1266,8 +1288,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data += size; cx_data_sz -= size; -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) -#if CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) if (cpi->use_svc && !ctx->output_cx_pkt_cb.output_cx_pkt) { vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr; int sl; @@ -1288,7 +1309,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr); } #endif -#endif if (is_one_pass_cbr_svc(cpi) && (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { // Encoded all spatial layers; exit loop. @@ -1679,6 +1699,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 50, // rc_two_pass_vbrbias 0, // rc_two_pass_vbrmin_section 2000, // rc_two_pass_vbrmax_section + 0, // rc_2pass_vbr_corpus_complexity (non 0 for corpus vbr) // keyframing settings (kf) VPX_KF_AUTO, // g_kfmode diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c index 1da1794b7..657490f4b 100644 --- a/libvpx/vp9/vp9_dx_iface.c +++ b/libvpx/vp9/vp9_dx_iface.c @@ -47,9 +47,6 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, ctx->priv->init_flags = ctx->init_flags; priv->si.sz = sizeof(priv->si); priv->flushed = 0; - // TODO(jzern): remnants of frame-level parallel decoding should be - // removed. cf., https://bugs.chromium.org/p/webm/issues/detail?id=1395 - priv->frame_parallel_decode = 0; if (ctx->config.dec) { priv->cfg = *ctx->config.dec; ctx->config.dec = &priv->cfg; @@ -60,33 +57,8 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, } static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { - if (ctx->frame_workers != NULL) { - int i; - // Shutdown all threads before reclaiming any memory. The frame-level - // parallel decoder may access data from another worker. 
- for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - vpx_get_worker_interface()->end(worker); - } - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - vp9_remove_common(&frame_worker_data->pbi->common); -#if CONFIG_VP9_POSTPROC - vp9_free_postproc_buffers(&frame_worker_data->pbi->common); -#endif - vp9_decoder_remove(frame_worker_data->pbi); - vpx_free(frame_worker_data->scratch_buffer); -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&frame_worker_data->stats_mutex); - pthread_cond_destroy(&frame_worker_data->stats_cond); -#endif - vpx_free(frame_worker_data); - } -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); -#endif + if (ctx->pbi != NULL) { + vp9_decoder_remove(ctx->pbi); } if (ctx->buffer_pool) { @@ -94,7 +66,6 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers); } - vpx_free(ctx->frame_workers); vpx_free(ctx->buffer_pool); vpx_free(ctx); return VPX_CODEC_OK; @@ -228,32 +199,26 @@ static vpx_codec_err_t update_error_state( } static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { - int i; - - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - BufferPool *const pool = cm->buffer_pool; - - cm->new_fb_idx = INVALID_IDX; - cm->byte_alignment = ctx->byte_alignment; - cm->skip_loop_filter = ctx->skip_loop_filter; - - if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { - pool->get_fb_cb = ctx->get_ext_fb_cb; - pool->release_fb_cb = ctx->release_ext_fb_cb; - pool->cb_priv = ctx->ext_priv; - } else { - pool->get_fb_cb = vp9_get_frame_buffer; - pool->release_fb_cb = vp9_release_frame_buffer; + VP9_COMMON *const cm = &ctx->pbi->common; + BufferPool *const pool = cm->buffer_pool; - if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to initialize internal frame buffers"); + cm->new_fb_idx = INVALID_IDX; + cm->byte_alignment = ctx->byte_alignment; + cm->skip_loop_filter = ctx->skip_loop_filter; - pool->cb_priv = &pool->int_frame_buffers; - } + if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { + pool->get_fb_cb = ctx->get_ext_fb_cb; + pool->release_fb_cb = ctx->release_ext_fb_cb; + pool->cb_priv = ctx->ext_priv; + } else { + pool->get_fb_cb = vp9_get_frame_buffer; + pool->release_fb_cb = vp9_release_frame_buffer; + + if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to initialize internal frame buffers"); + + pool->cb_priv = &pool->int_frame_buffers; } } @@ -270,124 +235,21 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) { flags->noise_level = ctx->postproc_cfg.noise_level; } -static int frame_worker_hook(void *arg1, void *arg2) { - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1; - const uint8_t *data = frame_worker_data->data; - (void)arg2; - - frame_worker_data->result = vp9_receive_compressed_data( - frame_worker_data->pbi, frame_worker_data->data_size, &data); - frame_worker_data->data_end = data; - - if (frame_worker_data->pbi->frame_parallel_decode) { - // In frame 
parallel decoding, a worker thread must successfully decode all - // the compressed data. - if (frame_worker_data->result != 0 || - frame_worker_data->data + frame_worker_data->data_size - 1 > data) { - VPxWorker *const worker = frame_worker_data->pbi->frame_worker_owner; - BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool; - // Signal all the other threads that are waiting for this frame. - vp9_frameworker_lock_stats(worker); - frame_worker_data->frame_context_ready = 1; - lock_buffer_pool(pool); - frame_worker_data->pbi->cur_buf->buf.corrupted = 1; - unlock_buffer_pool(pool); - frame_worker_data->pbi->need_resync = 1; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - return 0; - } - } else if (frame_worker_data->result != 0) { - // Check decode result in serial decode. - frame_worker_data->pbi->cur_buf->buf.corrupted = 1; - frame_worker_data->pbi->need_resync = 1; - } - return !frame_worker_data->result; -} - static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { - int i; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - ctx->last_show_frame = -1; - ctx->next_submit_worker_id = 0; - ctx->last_submit_worker_id = 0; - ctx->next_output_worker_id = 0; - ctx->frame_cache_read = 0; - ctx->frame_cache_write = 0; - ctx->num_cache_frames = 0; ctx->need_resync = 1; - ctx->num_frame_workers = - (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads : 1; - if (ctx->num_frame_workers > MAX_DECODE_THREADS) - ctx->num_frame_workers = MAX_DECODE_THREADS; - ctx->available_threads = ctx->num_frame_workers; ctx->flushed = 0; ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool)); if (ctx->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) { - set_error_detail(ctx, "Failed to allocate buffer pool mutex"); + ctx->pbi = vp9_decoder_create(ctx->buffer_pool); + if (ctx->pbi == NULL) { + set_error_detail(ctx, "Failed to allocate decoder"); return VPX_CODEC_MEM_ERROR; } -#endif - - ctx->frame_workers = (VPxWorker *)vpx_malloc(ctx->num_frame_workers * - sizeof(*ctx->frame_workers)); - if (ctx->frame_workers == NULL) { - set_error_detail(ctx, "Failed to allocate frame_workers"); - return VPX_CODEC_MEM_ERROR; - } - - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *frame_worker_data = NULL; - winterface->init(worker); - worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData)); - if (worker->data1 == NULL) { - set_error_detail(ctx, "Failed to allocate frame_worker_data"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool); - if (frame_worker_data->pbi == NULL) { - set_error_detail(ctx, "Failed to allocate frame_worker_data"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data->pbi->frame_worker_owner = worker; - frame_worker_data->worker_id = i; - frame_worker_data->scratch_buffer = NULL; - frame_worker_data->scratch_buffer_size = 0; - frame_worker_data->frame_context_ready = 0; - frame_worker_data->received_frame = 0; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) { - set_error_detail(ctx, "Failed to allocate frame_worker_data mutex"); - return VPX_CODEC_MEM_ERROR; - } - - if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) { - set_error_detail(ctx, "Failed to allocate frame_worker_data cond"); - return 
VPX_CODEC_MEM_ERROR; - } -#endif - // If decoding in serial mode, FrameWorker thread could create tile worker - // thread or loopfilter thread. - frame_worker_data->pbi->max_threads = - (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0; - - frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order; - frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode; - frame_worker_data->pbi->common.frame_parallel_decode = - ctx->frame_parallel_decode; - worker->hook = (VPxWorkerHook)frame_worker_hook; - if (!winterface->reset(worker)) { - set_error_detail(ctx, "Frame Worker thread creation failed"); - return VPX_CODEC_MEM_ERROR; - } - } + ctx->pbi->max_threads = ctx->cfg.threads; + ctx->pbi->inv_tile_order = ctx->invert_tile_order; // If postprocessing was enabled by the application and a // configuration has not been provided, default it. @@ -401,7 +263,7 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, const VP9Decoder *const pbi) { - // Clear resync flag if worker got a key frame or intra only frame. + // Clear resync flag if the decoder got a key frame or intra only frame. if (ctx->need_resync == 1 && pbi->need_resync == 0 && (pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME)) ctx->need_resync = 0; @@ -410,7 +272,6 @@ static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, const uint8_t **data, unsigned int data_sz, void *user_priv, int64_t deadline) { - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); (void)deadline; // Determine the stream parameters. Note that we rely on peek_si to @@ -426,101 +287,23 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, if (!ctx->si.is_kf && !is_intra_only) return VPX_CODEC_ERROR; } - if (!ctx->frame_parallel_decode) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->data = *data; - frame_worker_data->data_size = data_sz; - frame_worker_data->user_priv = user_priv; - frame_worker_data->received_frame = 1; - - // Set these even if already initialized. The caller may have changed the - // decrypt config between frames. - frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb; - frame_worker_data->pbi->decrypt_state = ctx->decrypt_state; - - worker->had_error = 0; - winterface->execute(worker); + ctx->user_priv = user_priv; - // Update data pointer after decode. - *data = frame_worker_data->data_end; + // Set these even if already initialized. The caller may have changed the + // decrypt config between frames. + ctx->pbi->decrypt_cb = ctx->decrypt_cb; + ctx->pbi->decrypt_state = ctx->decrypt_state; - if (worker->had_error) - return update_error_state(ctx, &frame_worker_data->pbi->common.error); - - check_resync(ctx, frame_worker_data->pbi); - } else { - VPxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - // Copy context from last worker thread to next worker thread. - if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) - vp9_frameworker_copy_context( - &ctx->frame_workers[ctx->next_submit_worker_id], - &ctx->frame_workers[ctx->last_submit_worker_id]); - - frame_worker_data->pbi->ready_for_new_data = 0; - // Copy the compressed data into worker's internal buffer. 
- // TODO(hkuang): Will all the workers allocate the same size - // as the size of the first intra frame be better? This will - // avoid too many deallocate and allocate. - if (frame_worker_data->scratch_buffer_size < data_sz) { - vpx_free(frame_worker_data->scratch_buffer); - frame_worker_data->scratch_buffer = (uint8_t *)vpx_malloc(data_sz); - if (frame_worker_data->scratch_buffer == NULL) { - set_error_detail(ctx, "Failed to reallocate scratch buffer"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data->scratch_buffer_size = data_sz; - } - frame_worker_data->data_size = data_sz; - memcpy(frame_worker_data->scratch_buffer, *data, data_sz); - - frame_worker_data->frame_decoded = 0; - frame_worker_data->frame_context_ready = 0; - frame_worker_data->received_frame = 1; - frame_worker_data->data = frame_worker_data->scratch_buffer; - frame_worker_data->user_priv = user_priv; - - if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) - ctx->last_submit_worker_id = - (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers; - - ctx->next_submit_worker_id = - (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers; - --ctx->available_threads; - worker->had_error = 0; - winterface->launch(worker); + if (vp9_receive_compressed_data(ctx->pbi, data_sz, data)) { + ctx->pbi->cur_buf->buf.corrupted = 1; + ctx->pbi->need_resync = 1; + ctx->need_resync = 1; + return update_error_state(ctx, &ctx->pbi->common.error); } - return VPX_CODEC_OK; -} + check_resync(ctx, ctx->pbi); -static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) { - YV12_BUFFER_CONFIG sd; - vp9_ppflags_t flags = { 0, 0, 0 }; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - ctx->next_output_worker_id = - (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; - // TODO(hkuang): Add worker error handling here. - winterface->sync(worker); - frame_worker_data->received_frame = 0; - ++ctx->available_threads; - - check_resync(ctx, frame_worker_data->pbi); - - if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) { - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx; - yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd, - frame_worker_data->user_priv); - ctx->frame_cache[ctx->frame_cache_write].img.fb_priv = - frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; - ctx->frame_cache_write = (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE; - ++ctx->num_cache_frames; - } + return VPX_CODEC_OK; } static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, @@ -540,8 +323,8 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, // Reset flushed when receiving a valid frame. ctx->flushed = 0; - // Initialize the decoder workers on the first frame. - if (ctx->frame_workers == NULL) { + // Initialize the decoder on the first frame. + if (ctx->pbi == NULL) { const vpx_codec_err_t res = init_decoder(ctx); if (res != VPX_CODEC_OK) return res; } @@ -553,91 +336,37 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, if (ctx->svc_decoding && ctx->svc_spatial_layer < frame_count - 1) frame_count = ctx->svc_spatial_layer + 1; - if (ctx->frame_parallel_decode) { - // Decode in frame parallel mode. 
When decoding in this mode, the frame - // passed to the decoder must be either a normal frame or a superframe with - // superframe index so the decoder could get each frame's start position - // in the superframe. - if (frame_count > 0) { - int i; - - for (i = 0; i < frame_count; ++i) { - const uint8_t *data_start_copy = data_start; - const uint32_t frame_size = frame_sizes[i]; - if (data_start < data || - frame_size > (uint32_t)(data_end - data_start)) { - set_error_detail(ctx, "Invalid frame size in index"); - return VPX_CODEC_CORRUPT_FRAME; - } - - if (ctx->available_threads == 0) { - // No more threads for decoding. Wait until the next output worker - // finishes decoding. Then copy the decoded frame into cache. - if (ctx->num_cache_frames < FRAME_CACHE_SIZE) { - wait_worker_and_cache_frame(ctx); - } else { - // TODO(hkuang): Add unit test to test this path. - set_error_detail(ctx, "Frame output cache is full."); - return VPX_CODEC_ERROR; - } - } + // Decode in serial mode. + if (frame_count > 0) { + int i; - res = - decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; - data_start += frame_size; - } - } else { - if (ctx->available_threads == 0) { - // No more threads for decoding. Wait until the next output worker - // finishes decoding. Then copy the decoded frame into cache. - if (ctx->num_cache_frames < FRAME_CACHE_SIZE) { - wait_worker_and_cache_frame(ctx); - } else { - // TODO(hkuang): Add unit test to test this path. - set_error_detail(ctx, "Frame output cache is full."); - return VPX_CODEC_ERROR; - } + for (i = 0; i < frame_count; ++i) { + const uint8_t *data_start_copy = data_start; + const uint32_t frame_size = frame_sizes[i]; + vpx_codec_err_t res; + if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) { + set_error_detail(ctx, "Invalid frame size in index"); + return VPX_CODEC_CORRUPT_FRAME; } - res = decode_one(ctx, &data, data_sz, user_priv, deadline); + res = decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); if (res != VPX_CODEC_OK) return res; + + data_start += frame_size; } } else { - // Decode in serial mode. - if (frame_count > 0) { - int i; - - for (i = 0; i < frame_count; ++i) { - const uint8_t *data_start_copy = data_start; - const uint32_t frame_size = frame_sizes[i]; - vpx_codec_err_t res; - if (data_start < data || - frame_size > (uint32_t)(data_end - data_start)) { - set_error_detail(ctx, "Invalid frame size in index"); - return VPX_CODEC_CORRUPT_FRAME; - } - - res = - decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; + while (data_start < data_end) { + const uint32_t frame_size = (uint32_t)(data_end - data_start); + const vpx_codec_err_t res = + decode_one(ctx, &data_start, frame_size, user_priv, deadline); + if (res != VPX_CODEC_OK) return res; - data_start += frame_size; - } - } else { + // Account for suboptimal termination by the encoder. while (data_start < data_end) { - const uint32_t frame_size = (uint32_t)(data_end - data_start); - const vpx_codec_err_t res = - decode_one(ctx, &data_start, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; - - // Account for suboptimal termination by the encoder. 
- while (data_start < data_end) { - const uint8_t marker = - read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start); - if (marker) break; - ++data_start; - } + const uint8_t marker = + read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start); + if (marker) break; + ++data_start; } } } @@ -645,80 +374,28 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, return res; } -static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) { - RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs; - // Decrease reference count of last output frame in frame parallel mode. - if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) { - BufferPool *const pool = ctx->buffer_pool; - lock_buffer_pool(pool); - decrease_ref_count(ctx->last_show_frame, frame_bufs, pool); - unlock_buffer_pool(pool); - } -} - static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter) { vpx_image_t *img = NULL; - // Only return frame when all the cpu are busy or - // application fluhsed the decoder in frame parallel decode. - if (ctx->frame_parallel_decode && ctx->available_threads > 0 && - !ctx->flushed) { - return NULL; - } + // Legacy parameter carried over from VP8. Has no effect for VP9 since we + // always return only 1 frame per decode call. + (void)iter; - // Output the frames in the cache first. - if (ctx->num_cache_frames > 0) { - release_last_output_frame(ctx); - ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx; - if (ctx->need_resync) return NULL; - img = &ctx->frame_cache[ctx->frame_cache_read].img; - ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE; - --ctx->num_cache_frames; - return img; - } - - // iter acts as a flip flop, so an image is only returned on the first - // call to get_frame. - if (*iter == NULL && ctx->frame_workers != NULL) { - do { - YV12_BUFFER_CONFIG sd; - vp9_ppflags_t flags = { 0, 0, 0 }; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - ctx->next_output_worker_id = - (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; - if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) - set_ppflags(ctx, &flags); - // Wait for the frame from worker thread. - if (winterface->sync(worker)) { - // Check if worker has received any frames. - if (frame_worker_data->received_frame == 1) { - ++ctx->available_threads; - frame_worker_data->received_frame = 0; - check_resync(ctx, frame_worker_data->pbi); - } - if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) { - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - release_last_output_frame(ctx); - ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx; - if (ctx->need_resync) return NULL; - yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv); - ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; - img = &ctx->img; - return img; - } - } else { - // Decoding failed. Release the worker thread. 
- frame_worker_data->received_frame = 0; - ++ctx->available_threads; - ctx->need_resync = 1; - if (ctx->flushed != 1) return NULL; - } - } while (ctx->next_output_worker_id != ctx->next_submit_worker_id); + if (ctx->pbi != NULL) { + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags = { 0, 0, 0 }; + if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) set_ppflags(ctx, &flags); + if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) { + VP9_COMMON *const cm = &ctx->pbi->common; + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + ctx->last_show_frame = ctx->pbi->common.new_fb_idx; + if (ctx->need_resync) return NULL; + yuvconfig2image(&ctx->img, &sd, ctx->user_priv); + ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; + img = &ctx->img; + return img; + } } return NULL; } @@ -728,7 +405,7 @@ static vpx_codec_err_t decoder_set_fb_fn( vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { if (cb_get == NULL || cb_release == NULL) { return VPX_CODEC_INVALID_PARAM; - } else if (ctx->frame_workers == NULL) { + } else if (ctx->pbi == NULL) { // If the decoder has already been initialized, do not accept changes to // the frame buffer functions. ctx->get_ext_fb_cb = cb_get; @@ -744,21 +421,12 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); - return vp9_set_reference_dec(&frame_worker_data->pbi->common, - ref_frame_to_vp9_reframe(frame->frame_type), - &sd); + return vp9_set_reference_dec( + &ctx->pbi->common, ref_frame_to_vp9_reframe(frame->frame_type), &sd); } else { return VPX_CODEC_INVALID_PARAM; } @@ -768,20 +436,12 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); - return vp9_copy_reference_dec(frame_worker_data->pbi, - (VP9_REFFRAME)frame->frame_type, &sd); + return vp9_copy_reference_dec(ctx->pbi, (VP9_REFFRAME)frame->frame_type, + &sd); } else { return VPX_CODEC_INVALID_PARAM; } @@ -791,17 +451,9 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); - // Only support this function in serial decode. 
- if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { YV12_BUFFER_CONFIG *fb; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx); + fb = get_ref_frame(&ctx->pbi->common, data->idx); if (fb == NULL) return VPX_CODEC_ERROR; yuvconfig2image(&data->img, fb, NULL); return VPX_CODEC_OK; @@ -832,9 +484,8 @@ static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); - if (arg == NULL) return VPX_CODEC_INVALID_PARAM; - *arg = - ((FrameWorkerData *)ctx->frame_workers[0].data1)->pbi->common.base_qindex; + if (arg == NULL || ctx->pbi == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = ctx->pbi->common.base_qindex; return VPX_CODEC_OK; } @@ -842,18 +493,9 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, va_list args) { int *const update_info = va_arg(args, int *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (update_info) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - *update_info = frame_worker_data->pbi->refresh_frame_flags; + if (ctx->pbi != NULL) { + *update_info = ctx->pbi->refresh_frame_flags; return VPX_CODEC_OK; } else { return VPX_CODEC_ERROR; @@ -868,14 +510,9 @@ static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, int *corrupted = va_arg(args, int *); if (corrupted) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - RefCntBuffer *const frame_bufs = - frame_worker_data->pbi->common.buffer_pool->frame_bufs; - if (frame_worker_data->pbi->common.frame_to_show == NULL) - return VPX_CODEC_ERROR; + if (ctx->pbi != NULL) { + RefCntBuffer *const frame_bufs = ctx->pbi->common.buffer_pool->frame_bufs; + if (ctx->pbi->common.frame_to_show == NULL) return VPX_CODEC_ERROR; if (ctx->last_show_frame >= 0) *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted; return VPX_CODEC_OK; @@ -891,18 +528,9 @@ static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx, va_list args) { int *const frame_size = va_arg(args, int *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (frame_size) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; frame_size[0] = cm->width; frame_size[1] = cm->height; return VPX_CODEC_OK; @@ -918,18 +546,9 @@ static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx, va_list args) { int *const render_size = va_arg(args, int *); - // Only support this function in serial decode. 
- if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (render_size) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; render_size[0] = cm->render_width; render_size[1] = cm->render_height; return VPX_CODEC_OK; @@ -944,13 +563,10 @@ static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx, va_list args) { unsigned int *const bit_depth = va_arg(args, unsigned int *); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; if (bit_depth) { - if (worker) { - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; *bit_depth = cm->bit_depth; return VPX_CODEC_OK; } else { @@ -989,10 +605,8 @@ static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; ctx->byte_alignment = byte_alignment; - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi->common.byte_alignment = byte_alignment; + if (ctx->pbi != NULL) { + ctx->pbi->common.byte_alignment = byte_alignment; } return VPX_CODEC_OK; } @@ -1001,10 +615,8 @@ static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx, va_list args) { ctx->skip_loop_filter = va_arg(args, int); - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter; + if (ctx->pbi != NULL) { + ctx->pbi->common.skip_loop_filter = ctx->skip_loop_filter; } return VPX_CODEC_OK; diff --git a/libvpx/vp9/vp9_dx_iface.h b/libvpx/vp9/vp9_dx_iface.h index c1559599b..18bc7ab0d 100644 --- a/libvpx/vp9/vp9_dx_iface.h +++ b/libvpx/vp9/vp9_dx_iface.h @@ -15,19 +15,12 @@ typedef vpx_codec_stream_info_t vp9_stream_info_t; -// This limit is due to framebuffer numbers. -// TODO(hkuang): Remove this limit after implementing ondemand framebuffers. -#define FRAME_CACHE_SIZE 6 // Cache maximum 6 decoded frames. - -typedef struct cache_frame { - int fb_idx; - vpx_image_t img; -} cache_frame; - struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_dec_cfg_t cfg; vp9_stream_info_t si; + VP9Decoder *pbi; + void *user_priv; int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; vpx_decrypt_cb decrypt_cb; @@ -40,20 +33,8 @@ struct vpx_codec_alg_priv { int byte_alignment; int skip_loop_filter; - // Frame parallel related. - int frame_parallel_decode; // frame-based threading. - VPxWorker *frame_workers; - int num_frame_workers; - int next_submit_worker_id; - int last_submit_worker_id; - int next_output_worker_id; - int available_threads; - cache_frame frame_cache[FRAME_CACHE_SIZE]; - int frame_cache_write; - int frame_cache_read; - int num_cache_frames; int need_resync; // wait for key/intra-only frame - // BufferPool that holds all reference frames. Shared by all the FrameWorkers. + // BufferPool that holds all reference frames. 
BufferPool *buffer_pool; // External frame buffer info to save for VP9 common. diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk index 47846c941..d633ed142 100644 --- a/libvpx/vp9/vp9cx.mk +++ b/libvpx/vp9/vp9cx.mk @@ -130,6 +130,7 @@ ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c endif VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c @@ -138,4 +139,10 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h +# Strip unnecessary files with CONFIG_REALTIME_ONLY +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c + VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/libvpx/vp9/vp9dx.mk b/libvpx/vp9/vp9dx.mk index 4c6fd0071..59f612b94 100644 --- a/libvpx/vp9/vp9dx.mk +++ b/libvpx/vp9/vp9dx.mk @@ -24,8 +24,6 @@ VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h VP9_DX_SRCS-yes += decoder/vp9_detokenize.c VP9_DX_SRCS-yes += decoder/vp9_decodemv.h VP9_DX_SRCS-yes += decoder/vp9_detokenize.h -VP9_DX_SRCS-yes += decoder/vp9_dthread.c -VP9_DX_SRCS-yes += decoder/vp9_dthread.h VP9_DX_SRCS-yes += decoder/vp9_decoder.c VP9_DX_SRCS-yes += decoder/vp9_decoder.h VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c diff --git a/libvpx/vpx/src/svc_encodeframe.c b/libvpx/vpx/src/svc_encodeframe.c index c774abb34..f633600c7 100644 --- a/libvpx/vpx/src/svc_encodeframe.c +++ b/libvpx/vpx/src/svc_encodeframe.c @@ -131,9 +131,9 @@ static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt, static vpx_codec_err_t extract_option(LAYER_OPTION_TYPE type, char *input, int *value0, int *value1) { if (type == SCALE_FACTOR) { - *value0 = strtol(input, &input, 10); + *value0 = (int)strtol(input, &input, 10); if (*input++ != '/') return VPX_CODEC_INVALID_PARAM; - *value1 = strtol(input, &input, 10); + *value1 = (int)strtol(input, &input, 10); if (*value0 < option_min_values[SCALE_FACTOR] || *value1 < option_min_values[SCALE_FACTOR] || @@ -559,8 +559,7 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, iter = NULL; while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) { switch (cx_pkt->kind) { -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) -#if CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: { int i; for (i = 0; i < svc_ctx->spatial_layers; ++i) { @@ -595,9 +594,8 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, break; } #endif -#endif case VPX_CODEC_PSNR_PKT: { -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) +#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) int j; svc_log(svc_ctx, SVC_LOG_DEBUG, "frame: %d, layer: %d, PSNR(Total/Y/U/V): " diff --git a/libvpx/vpx/src/vpx_encoder.c b/libvpx/vpx/src/vpx_encoder.c index 4390cf7c8..1cf2dca69 100644 --- a/libvpx/vpx/src/vpx_encoder.c +++ b/libvpx/vpx/src/vpx_encoder.c @@ -12,8 +12,11 @@ * \brief Provides the high level interface to wrap encoder 
algorithms. * */ +#include <assert.h> #include <limits.h> +#include <stdlib.h> #include <string.h> +#include "vp8/common/blockd.h" #include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" @@ -81,6 +84,8 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( int i; void *mem_loc = NULL; + if (iface->enc.mr_get_mem_loc == NULL) return VPX_CODEC_INCAPABLE; + if (!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) { for (i = 0; i < num_enc; i++) { vpx_codec_priv_enc_mr_cfg_t mr_cfg; @@ -89,28 +94,27 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 || dsf->den > dsf->num) { res = VPX_CODEC_INVALID_PARAM; - break; + } else { + mr_cfg.mr_low_res_mode_info = mem_loc; + mr_cfg.mr_total_resolutions = num_enc; + mr_cfg.mr_encoder_id = num_enc - 1 - i; + mr_cfg.mr_down_sampling_factor.num = dsf->num; + mr_cfg.mr_down_sampling_factor.den = dsf->den; + + /* Force Key-frame synchronization. Namely, encoder at higher + * resolution always use the same frame_type chosen by the + * lowest-resolution encoder. + */ + if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; + + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx, &mr_cfg); } - mr_cfg.mr_low_res_mode_info = mem_loc; - mr_cfg.mr_total_resolutions = num_enc; - mr_cfg.mr_encoder_id = num_enc - 1 - i; - mr_cfg.mr_down_sampling_factor.num = dsf->num; - mr_cfg.mr_down_sampling_factor.den = dsf->den; - - /* Force Key-frame synchronization. Namely, encoder at higher - * resolution always use the same frame_type chosen by the - * lowest-resolution encoder. - */ - if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; - - ctx->iface = iface; - ctx->name = iface->name; - ctx->priv = NULL; - ctx->init_flags = flags; - ctx->config.enc = cfg; - res = ctx->iface->init(ctx, &mr_cfg); - if (res) { const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL; /* Destroy current ctx */ @@ -124,10 +128,14 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( vpx_codec_destroy(ctx); i--; } +#if CONFIG_MULTI_RES_ENCODING + assert(mem_loc); + free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info); + free(mem_loc); +#endif + return SAVE_STATUS(ctx, res); } - if (res) break; - ctx++; cfg++; dsf++; diff --git a/libvpx/vpx/vp8cx.h b/libvpx/vpx/vp8cx.h index ee6be4a24..c21b8b60d 100644 --- a/libvpx/vpx/vp8cx.h +++ b/libvpx/vpx/vp8cx.h @@ -333,11 +333,12 @@ enum vp8e_enc_control_id { * 2 = 4 tile columns * ..... * n = 2**n tile columns - * The requested tile columns will be capped by encoder based on image size - * limitation (The minimum width of a tile column is 256 pixel, the maximum - * is 4096). + * The requested tile columns will be capped by the encoder based on image + * size limitations (The minimum width of a tile column is 256 pixels, the + * maximum is 4096). * - * By default, the value is 0, i.e. one single column tile for entire image. + * By default, the value is 6, i.e., the maximum number of tiles supported by + * the resolution. * * Supported in codecs: VP9 */ @@ -368,10 +369,10 @@ enum vp8e_enc_control_id { * VP9 has a bitstream feature to reduce decoding dependency between frames * by turning off backward update of probability context used in encoding * and decoding. This allows staged parallel processing of more than one - * video frames in the decoder. This control function provides a mean to + * video frame in the decoder. 
This control function provides a means to * turn this feature on or off for bitstreams produced by encoder. * - * By default, this feature is off. + * By default, this feature is on. * * Supported in codecs: VP9 */ @@ -407,7 +408,7 @@ enum vp8e_enc_control_id { /*!\brief Codec control function to set noise sensitivity. * - * 0: off, 1: On(YOnly) + * 0: off, 1: On(YOnly), 2: For SVC only, on top two spatial layers(YOnly) * * Supported in codecs: VP9 */ @@ -443,6 +444,7 @@ enum vp8e_enc_control_id { * \note Valid parameter range: * VP9E_CONTENT_DEFAULT = Regular video content (Default) * VP9E_CONTENT_SCREEN = Screen capture content + * VP9E_CONTENT_FILM = Film content: improves grain retention * * Supported in codecs: VP9 */ @@ -695,6 +697,7 @@ typedef enum { typedef enum { VP9E_CONTENT_DEFAULT, VP9E_CONTENT_SCREEN, + VP9E_CONTENT_FILM, VP9E_CONTENT_INVALID } vp9e_tune_content; diff --git a/libvpx/vpx/vp8dx.h b/libvpx/vpx/vp8dx.h index 41c53e48d..398c67022 100644 --- a/libvpx/vpx/vp8dx.h +++ b/libvpx/vpx/vp8dx.h @@ -179,6 +179,8 @@ VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER #define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) +#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER +VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) /*!\endcond */ /*! @} - end defgroup vp8_decoder */ diff --git a/libvpx/vpx/vpx_codec.h b/libvpx/vpx/vpx_codec.h index e91cd9e0d..ad05f4c74 100644 --- a/libvpx/vpx/vpx_codec.h +++ b/libvpx/vpx/vpx_codec.h @@ -46,34 +46,35 @@ extern "C" { #include "./vpx_integer.h" /*!\brief Decorator indicating a function is deprecated */ -#ifndef DEPRECATED +#ifndef VPX_DEPRECATED #if defined(__GNUC__) && __GNUC__ -#define DEPRECATED __attribute__((deprecated)) +#define VPX_DEPRECATED __attribute__((deprecated)) #elif defined(_MSC_VER) -#define DEPRECATED +#define VPX_DEPRECATED #else -#define DEPRECATED +#define VPX_DEPRECATED #endif -#endif /* DEPRECATED */ +#endif /* VPX_DEPRECATED */ -#ifndef DECLSPEC_DEPRECATED +#ifndef VPX_DECLSPEC_DEPRECATED #if defined(__GNUC__) && __GNUC__ -#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */ #elif defined(_MSC_VER) -/*!\brief \copydoc #DEPRECATED */ -#define DECLSPEC_DEPRECATED __declspec(deprecated) +/*!\brief \copydoc #VPX_DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED __declspec(deprecated) #else -#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */ #endif -#endif /* DECLSPEC_DEPRECATED */ +#endif /* VPX_DECLSPEC_DEPRECATED */ /*!\brief Decorator indicating a function is potentially unused */ -#ifdef UNUSED -#elif defined(__GNUC__) || defined(__clang__) -#define UNUSED __attribute__((unused)) +#ifndef VPX_UNUSED +#if defined(__GNUC__) || defined(__clang__) +#define VPX_UNUSED __attribute__((unused)) #else -#define UNUSED +#define VPX_UNUSED #endif +#endif /* VPX_UNUSED */ /*!\brief Current ABI version number * @@ -413,7 +414,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); */ #define VPX_CTRL_USE_TYPE(id, typ) \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int, typ) \ - UNUSED; \ + VPX_UNUSED; \ \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ int ctrl_id, typ data) { \ @@ -430,13 +431,13 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); * It defines a static function with the correctly typed arguments as a * 
wrapper to the type-unsafe internal function. */ -#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ - DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ - vpx_codec_ctx_t *, int, typ) DEPRECATED UNUSED; \ - \ - DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ - vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \ - return vpx_codec_control_(ctx, ctrl_id, data); \ +#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ + VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *, int, typ) VPX_DEPRECATED VPX_UNUSED; \ + \ + VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \ + return vpx_codec_control_(ctx, ctrl_id, data); \ } /**<\hideinitializer*/ /*!\brief vpx_codec_control void type definition macro @@ -451,7 +452,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); */ #define VPX_CTRL_VOID(id) \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int) \ - UNUSED; \ + VPX_UNUSED; \ \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ int ctrl_id) { \ diff --git a/libvpx/vpx/vpx_encoder.h b/libvpx/vpx/vpx_encoder.h index c915ed671..464bc408c 100644 --- a/libvpx/vpx/vpx_encoder.h +++ b/libvpx/vpx/vpx_encoder.h @@ -63,7 +63,7 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (6 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -154,9 +154,8 @@ enum vpx_codec_cx_pkt_kind { VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ -// Spatial SVC is still experimental and may be removed before the next ABI -// bump. -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) +// Spatial SVC is still experimental and may be removed. +#if defined(VPX_TEST_SPATIAL_SVC) VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/ VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/ #endif @@ -192,9 +191,8 @@ typedef struct vpx_codec_cx_pkt { double psnr[4]; /**< PSNR, total/y/u/v */ } psnr; /**< data for PSNR packet */ vpx_fixed_buf_t raw; /**< data for arbitrary packets */ -// Spatial SVC is still experimental and may be removed before the next -// ABI bump. -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) +// Spatial SVC is still experimental and may be removed. +#if defined(VPX_TEST_SPATIAL_SVC) size_t layer_sizes[VPX_SS_MAX_LAYERS]; struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS]; #endif @@ -508,25 +506,31 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Rate control adaptation undershoot control * - * This value, expressed as a percentage of the target bitrate, + * VP8: Expressed as a percentage of the target bitrate, * controls the maximum allowed adaptation speed of the codec. * This factor controls the maximum amount of bits that can * be subtracted from the target bitrate in order to compensate * for prior overshoot. - * - * Valid values in the range 0-1000. + * VP9: Expressed as a percentage of the target bitrate, a threshold + * undershoot level (current rate vs target) beyond which more aggressive + * corrective measures are taken. + * * + * Valid values in the range VP8:0-1000 VP9: 0-100.
*/ unsigned int rc_undershoot_pct; /*!\brief Rate control adaptation overshoot control * - * This value, expressed as a percentage of the target bitrate, + * VP8: Expressed as a percentage of the target bitrate, * controls the maximum allowed adaptation speed of the codec. * This factor controls the maximum amount of bits that can * be added to the target bitrate in order to compensate for * prior undershoot. + * VP9: Expressed as a percentage of the target bitrate, a threshold + * overshoot level (current rate vs target) beyond which more aggressive + * corrective measures are taken. * - * Valid values in the range 0-1000. + * Valid values in the range VP8:0-1000 VP9: 0-100. */ unsigned int rc_overshoot_pct; @@ -591,6 +595,13 @@ typedef struct vpx_codec_enc_cfg { */ unsigned int rc_2pass_vbr_maxsection_pct; + /*!\brief Two-pass corpus vbr mode complexity control + * Used only in VP9: A value representing the corpus midpoint complexity + * for corpus vbr mode. This value defaults to 0 which disables corpus vbr + * mode in favour of normal vbr mode. + */ + unsigned int rc_2pass_vbr_corpus_complexity; + /* * keyframing settings (kf) */ diff --git a/libvpx/vpx_dsp/add_noise.c b/libvpx/vpx_dsp/add_noise.c index a2b4c9010..cda6ae881 100644 --- a/libvpx/vpx_dsp/add_noise.c +++ b/libvpx/vpx_dsp/add_noise.c @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/postproc.h" #include "vpx_ports/mem.h" void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, diff --git a/libvpx/vpx_dsp/arm/avg_neon.c b/libvpx/vpx_dsp/arm/avg_neon.c index 257e8ffee..fa7dd0960 100644 --- a/libvpx/vpx_dsp/arm/avg_neon.c +++ b/libvpx/vpx_dsp/arm/avg_neon.c @@ -17,51 +17,35 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { - const uint32x4_t a = vpaddlq_u16(v_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); +uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) { + const uint8x16_t b = load_unaligned_u8q(a, a_stride); + const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b)); + const uint32x2_t d = horizontal_add_uint16x8(c); + return vget_lane_u32(vrshr_n_u32(d, 4), 0); } -unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) { - uint16x8_t v_sum; - uint32x2_t v_s0 = vdup_n_u32(0); - uint32x2_t v_s1 = vdup_n_u32(0); - v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0); - v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1); - v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0); - v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1); - v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1)); - return (horizontal_add_u16x8(v_sum) + 8) >> 4; -} - -unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) { - uint8x8_t v_s0 = vld1_u8(s); - const uint8x8_t v_s1 = vld1_u8(s + p); - uint16x8_t v_sum = vaddl_u8(v_s0, v_s1); - - v_s0 = vld1_u8(s + 2 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 3 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 4 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 5 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 6 * p); - v_sum = vaddw_u8(v_sum, v_s0); +uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) { + int i; + uint8x8_t b, c; +
uint16x8_t sum; + uint32x2_t d; + b = vld1_u8(a); + a += a_stride; + c = vld1_u8(a); + a += a_stride; + sum = vaddl_u8(b, c); + + for (i = 0; i < 6; ++i) { + const uint8x8_t d = vld1_u8(a); + a += a_stride; + sum = vaddw_u8(sum, d); + } - v_s0 = vld1_u8(s + 7 * p); - v_sum = vaddw_u8(v_sum, v_s0); + d = horizontal_add_uint16x8(sum); - return (horizontal_add_u16x8(v_sum) + 32) >> 6; + return vget_lane_u32(vrshr_n_u32(d, 6), 0); } // coeff: 16 bits, dynamic range [-32640, 32640]. @@ -155,7 +139,8 @@ int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { ref += 16; } - return horizontal_add_u16x8(vec_sum); + return vget_lane_s16(vreinterpret_s16_u32(horizontal_add_uint16x8(vec_sum)), + 0); } // ref, src = [0, 510] - max diff = 16-bits @@ -185,7 +170,7 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { { // Note: 'total''s pairwise addition could be implemented similarly to - // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired + // horizontal_add_uint16x8(), but one less vpaddl with 'total' when paired // with the summation of 'sse' performed better on a Cortex-A15. const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total' const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); diff --git a/libvpx/vpx_dsp/arm/avg_pred_neon.c b/libvpx/vpx_dsp/arm/avg_pred_neon.c new file mode 100644 index 000000000..1370ec2d2 --- /dev/null +++ b/libvpx/vpx_dsp/arm/avg_pred_neon.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + if (width > 8) { + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; x += 16) { + const uint8x16_t p = vld1q_u8(pred + x); + const uint8x16_t r = vld1q_u8(ref + x); + const uint8x16_t avg = vrhaddq_u8(p, r); + vst1q_u8(comp + x, avg); + } + comp += width; + pred += width; + ref += ref_stride; + } + } else { + int i; + for (i = 0; i < width * height; i += 16) { + const uint8x16_t p = vld1q_u8(pred); + uint8x16_t r; + + if (width == 4) { + r = load_unaligned_u8q(ref, ref_stride); + ref += 4 * ref_stride; + } else { + const uint8x8_t r_0 = vld1_u8(ref); + const uint8x8_t r_1 = vld1_u8(ref + ref_stride); + assert(width == 8); + r = vcombine_u8(r_0, r_1); + ref += 2 * ref_stride; + } + r = vrhaddq_u8(r, p); + vst1q_u8(comp, r); + + pred += 16; + comp += 16; + } + } +} diff --git a/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/libvpx/vpx_dsp/arm/fdct16x16_neon.c new file mode 100644 index 000000000..6b2bebd09 --- /dev/null +++ b/libvpx/vpx_dsp/arm/fdct16x16_neon.c @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +// Some builds of gcc 4.9.2 and .3 have trouble with some of the inline +// functions. +#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 + +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { + vpx_fdct16x16_c(input, output, stride); +} + +#else + +static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) { + b[0] = vld1q_s16(a); + a += stride; + b[1] = vld1q_s16(a); + a += stride; + b[2] = vld1q_s16(a); + a += stride; + b[3] = vld1q_s16(a); + a += stride; + b[4] = vld1q_s16(a); + a += stride; + b[5] = vld1q_s16(a); + a += stride; + b[6] = vld1q_s16(a); + a += stride; + b[7] = vld1q_s16(a); + a += stride; + b[8] = vld1q_s16(a); + a += stride; + b[9] = vld1q_s16(a); + a += stride; + b[10] = vld1q_s16(a); + a += stride; + b[11] = vld1q_s16(a); + a += stride; + b[12] = vld1q_s16(a); + a += stride; + b[13] = vld1q_s16(a); + a += stride; + b[14] = vld1q_s16(a); + a += stride; + b[15] = vld1q_s16(a); +} + +// Store 8 16x8 values, assuming stride == 16. +static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) { + store_s16q_to_tran_low(a, b[0]); + a += 16; + store_s16q_to_tran_low(a, b[1]); + a += 16; + store_s16q_to_tran_low(a, b[2]); + a += 16; + store_s16q_to_tran_low(a, b[3]); + a += 16; + store_s16q_to_tran_low(a, b[4]); + a += 16; + store_s16q_to_tran_low(a, b[5]); + a += 16; + store_s16q_to_tran_low(a, b[6]); + a += 16; + store_s16q_to_tran_low(a, b[7]); +} + +// Load step of each pass. Add and subtract clear across the input, requiring +// all 16 values to be loaded. For the first pass it also multiplies by 4. + +// To maybe reduce register usage this could be combined with the load() step to +// get the first 4 and last 4 values, cross those, then load the middle 8 values +// and cross them. 
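/* For reference, a scalar sketch of the pass-0 cross step (illustrative
 * only; plain int16_t values stand in for one lane of each int16x8_t, and
 * the pass-1 variant below performs the same fold without the << 2): */
static void cross_input_pass0_scalar(const int16_t a[16], int16_t b[16]) {
  int i;
  for (i = 0; i < 8; ++i) {
    b[i] = (int16_t)((a[i] + a[15 - i]) << 2);        /* folded sums */
    b[8 + i] = (int16_t)((a[7 - i] - a[8 + i]) << 2); /* folded differences */
  }
}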
+static INLINE void cross_input(const int16x8_t *a /*[16]*/, + int16x8_t *b /*[16]*/, const int pass) { + if (pass == 0) { + b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2); + b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2); + b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2); + b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2); + b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2); + b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2); + b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2); + b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2); + + b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2); + b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2); + b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2); + b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2); + b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2); + b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2); + b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2); + b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2); + } else { + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + } +} + +// Quarter round at the beginning of the second pass. Can't use vrshr (rounding) +// because this only adds 1, not 1 << 2. +static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { + const int16x8_t one = vdupq_n_s16(1); + a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2); + a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2); + a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2); + a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2); + a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2); + a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2); + a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2); + a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2); + a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2); + a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2); + a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2); + a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2); + a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2); + a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2); + a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2); + a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); +} + +// fdct_round_shift((a +/- b) * c) +static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, + const tran_high_t c, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c0 +/- b * c1) +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t c0, + const tran_coef_t c1, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t a0 = 
vmull_n_s16(vget_low_s16(a), c0); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0); + const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1); + const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1); + const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0); + const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// Transpose 8x8 to a new location. Don't use transpose_neon.h because those +// are all in-place. +static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/, + int16x8_t *b /*[8]*/) { + // Swap 16 bit elements. + const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements. + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), + vreinterpretq_s32_s16(c3.val[0])); + const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), + vreinterpretq_s32_s16(c3.val[1])); + + // Swap 64 bit elements + const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); + const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); + const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); + const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + + b[0] = e0.val[0]; + b[1] = e1.val[0]; + b[2] = e2.val[0]; + b[3] = e3.val[0]; + b[4] = e0.val[1]; + b[5] = e1.val[1]; + b[6] = e2.val[1]; + b[7] = e3.val[1]; +} + +// Main body of fdct16x16. 
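/* The two butterfly helpers above are the NEON form of fdct_round_shift()
 * from the C transform. A scalar sketch of their arithmetic (assuming
 * DCT_CONST_BITS == 14 and ignoring the saturation applied by
 * vqrshrn_n_s32):
 *   butterfly_one_coeff: add = fdct_round_shift((a + b) * c)
 *                        sub = fdct_round_shift((a - b) * c)
 *   butterfly_two_coeff: add = fdct_round_shift(a * c1 + b * c0)
 *                        sub = fdct_round_shift(a * c0 - b * c1) */
static void butterfly_two_coeff_scalar(int32_t a, int32_t b, int32_t c0,
                                       int32_t c1, int32_t *add,
                                       int32_t *sub) {
  *add = (a * c1 + b * c0 + (1 << 13)) >> 14;
  *sub = (a * c0 - b * c1 + (1 << 13)) >> 14;
}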
+static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) { + int16x8_t s[8]; + int16x8_t x[4]; + int16x8_t step[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]); + + // Stage 3 + x[0] = vaddq_s16(s[4], s[5]); + x[1] = vsubq_s16(s[4], s[5]); + x[2] = vsubq_s16(s[7], s[6]); + x[3] = vaddq_s16(s[7], s[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. 
+ + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]); + + // step 3 + s[0] = vaddq_s16(in[8], s[3]); + s[1] = vaddq_s16(in[9], s[2]); + x[0] = vsubq_s16(in[9], s[2]); + x[1] = vsubq_s16(in[8], s[3]); + x[2] = vsubq_s16(in[15], s[4]); + x[3] = vsubq_s16(in[14], s[5]); + s[6] = vaddq_s16(in[14], s[5]); + s[7] = vaddq_s16(in[15], s[4]); + + // step 4 + // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) + // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]); + + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]); + + // step 5 + step[0] = vaddq_s16(s[0], s[1]); + step[1] = vsubq_s16(s[0], s[1]); + step[2] = vaddq_s16(x[1], s[2]); + step[3] = vsubq_s16(x[1], s[2]); + step[4] = vsubq_s16(x[2], s[5]); + step[5] = vaddq_s16(x[2], s[5]); + step[6] = vsubq_s16(s[7], s[6]); + step[7] = vaddq_s16(s[7], s[6]); + + // step 6 + // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) + // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) + // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) + // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64) + // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64) + // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * + // cospi_22_64) + // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) + // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) + butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9], + &out[7]); + butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1], + &out[15]); + butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13], + &out[3]); + butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5], + &out[11]); +} + +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x8_t temp0[16]; + int16x8_t temp1[16]; + int16x8_t temp2[16]; + int16x8_t temp3[16]; + + // Left half. + load(input, stride, temp0); + cross_input(temp0, temp1, 0); + dct_body(temp1, temp0); + + // Right half. + load(input + 8, stride, temp1); + cross_input(temp1, temp2, 0); + dct_body(temp2, temp1); + + // Transpose top left and top right quarters into one contiguous location to + // process to the top half. 
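/* Layout sketch for the steps below: after the first pass, temp0 and temp1
 * hold the transformed left and right 8-column halves, 16 rows each. The
 * next two transposes gather the top 8 rows of both halves into
 * temp2[0..15], one contiguous 16x8 block, so the second (column) pass can
 * run on it directly. */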
+ transpose_8x8(&temp0[0], &temp2[0]); + transpose_8x8(&temp1[0], &temp2[8]); + partial_round_shift(temp2); + cross_input(temp2, temp3, 1); + dct_body(temp3, temp2); + transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], + &temp2[5], &temp2[6], &temp2[7]); + transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], + &temp2[13], &temp2[14], &temp2[15]); + store(output, temp2); + store(output + 8, temp2 + 8); + output += 8 * 16; + + // Transpose bottom left and bottom right quarters into one contiguous + // location to process to the bottom half. + transpose_8x8(&temp0[8], &temp1[0]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], + &temp1[13], &temp1[14], &temp1[15]); + partial_round_shift(temp1); + cross_input(temp1, temp0, 1); + dct_body(temp0, temp1); + transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], + &temp1[5], &temp1[6], &temp1[7]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], + &temp1[13], &temp1[14], &temp1[15]); + store(output, temp1); + store(output + 8, temp1 + 8); +} +#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && + // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 diff --git a/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/libvpx/vpx_dsp/arm/fdct32x32_neon.c new file mode 100644 index 000000000..e9cd34904 --- /dev/null +++ b/libvpx/vpx_dsp/arm/fdct32x32_neon.c @@ -0,0 +1,1507 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +// Most gcc 4.9 distributions outside of Android do not generate correct code +// for this function. +#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ <= 9 + +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { + vpx_fdct32x32_c(input, output, stride); +} + +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct32x32_rd_c(input, output, stride); +} + +#else + +#define LOAD_INCREMENT(src, stride, dest, index) \ + do { \ + dest[index] = vld1q_s16(src); \ + src += stride; \ + } while (0) + +#define ADD_S16(src, index0, index1, dest, index3) \ + do { \ + dest[index3] = vaddq_s16(src[index0], src[index1]); \ + } while (0) + +#define ADD_SHIFT_S16(src, index0, index1) \ + do { \ + src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \ + } while (0) + +// Load, cross, and multiply by 4. Load the first 8 and last 8, then the +// middle +// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better? 
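/* A scalar sketch of the load/cross/scale step built from the macros above
 * (illustrative only: a[i * stride] stands for one lane of input row i of
 * the current 8-column slice): */
static void load_cross_scale_scalar(const int16_t *a, int stride,
                                    int16_t b[32]) {
  int i;
  for (i = 0; i < 16; ++i) {
    const int16_t top = a[i * stride];
    const int16_t bottom = a[(31 - i) * stride];
    b[i] = (int16_t)((top + bottom) << 2);      /* folded sums */
    b[31 - i] = (int16_t)((top - bottom) << 2); /* folded differences */
  }
}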
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { + const int16_t *a_end = a + 24 * stride; + int16x8_t c[8]; + + LOAD_INCREMENT(a, stride, b, 0); + LOAD_INCREMENT(a, stride, b, 1); + LOAD_INCREMENT(a, stride, b, 2); + LOAD_INCREMENT(a, stride, b, 3); + LOAD_INCREMENT(a, stride, b, 4); + LOAD_INCREMENT(a, stride, b, 5); + LOAD_INCREMENT(a, stride, b, 6); + LOAD_INCREMENT(a, stride, b, 7); + + LOAD_INCREMENT(a_end, stride, b, 24); + LOAD_INCREMENT(a_end, stride, b, 25); + LOAD_INCREMENT(a_end, stride, b, 26); + LOAD_INCREMENT(a_end, stride, b, 27); + LOAD_INCREMENT(a_end, stride, b, 28); + LOAD_INCREMENT(a_end, stride, b, 29); + LOAD_INCREMENT(a_end, stride, b, 30); + LOAD_INCREMENT(a_end, stride, b, 31); + + ADD_S16(b, 0, 31, c, 0); + ADD_S16(b, 1, 30, c, 1); + ADD_S16(b, 2, 29, c, 2); + ADD_S16(b, 3, 28, c, 3); + ADD_S16(b, 4, 27, c, 4); + ADD_S16(b, 5, 26, c, 5); + ADD_S16(b, 6, 25, c, 6); + ADD_S16(b, 7, 24, c, 7); + + ADD_SHIFT_S16(b, 7, 24); + ADD_SHIFT_S16(b, 6, 25); + ADD_SHIFT_S16(b, 5, 26); + ADD_SHIFT_S16(b, 4, 27); + ADD_SHIFT_S16(b, 3, 28); + ADD_SHIFT_S16(b, 2, 29); + ADD_SHIFT_S16(b, 1, 30); + ADD_SHIFT_S16(b, 0, 31); + + b[0] = vshlq_n_s16(c[0], 2); + b[1] = vshlq_n_s16(c[1], 2); + b[2] = vshlq_n_s16(c[2], 2); + b[3] = vshlq_n_s16(c[3], 2); + b[4] = vshlq_n_s16(c[4], 2); + b[5] = vshlq_n_s16(c[5], 2); + b[6] = vshlq_n_s16(c[6], 2); + b[7] = vshlq_n_s16(c[7], 2); + + LOAD_INCREMENT(a, stride, b, 8); + LOAD_INCREMENT(a, stride, b, 9); + LOAD_INCREMENT(a, stride, b, 10); + LOAD_INCREMENT(a, stride, b, 11); + LOAD_INCREMENT(a, stride, b, 12); + LOAD_INCREMENT(a, stride, b, 13); + LOAD_INCREMENT(a, stride, b, 14); + LOAD_INCREMENT(a, stride, b, 15); + LOAD_INCREMENT(a, stride, b, 16); + LOAD_INCREMENT(a, stride, b, 17); + LOAD_INCREMENT(a, stride, b, 18); + LOAD_INCREMENT(a, stride, b, 19); + LOAD_INCREMENT(a, stride, b, 20); + LOAD_INCREMENT(a, stride, b, 21); + LOAD_INCREMENT(a, stride, b, 22); + LOAD_INCREMENT(a, stride, b, 23); + + ADD_S16(b, 8, 23, c, 0); + ADD_S16(b, 9, 22, c, 1); + ADD_S16(b, 10, 21, c, 2); + ADD_S16(b, 11, 20, c, 3); + ADD_S16(b, 12, 19, c, 4); + ADD_S16(b, 13, 18, c, 5); + ADD_S16(b, 14, 17, c, 6); + ADD_S16(b, 15, 16, c, 7); + + ADD_SHIFT_S16(b, 15, 16); + ADD_SHIFT_S16(b, 14, 17); + ADD_SHIFT_S16(b, 13, 18); + ADD_SHIFT_S16(b, 12, 19); + ADD_SHIFT_S16(b, 11, 20); + ADD_SHIFT_S16(b, 10, 21); + ADD_SHIFT_S16(b, 9, 22); + ADD_SHIFT_S16(b, 8, 23); + + b[8] = vshlq_n_s16(c[0], 2); + b[9] = vshlq_n_s16(c[1], 2); + b[10] = vshlq_n_s16(c[2], 2); + b[11] = vshlq_n_s16(c[3], 2); + b[12] = vshlq_n_s16(c[4], 2); + b[13] = vshlq_n_s16(c[5], 2); + b[14] = vshlq_n_s16(c[6], 2); + b[15] = vshlq_n_s16(c[7], 2); +} + +#undef LOAD_INCREMENT +#undef ADD_S16 +#undef ADD_SHIFT_S16 + +#define STORE_S16(src, index, dest) \ + do { \ + store_s16q_to_tran_low(dest, src[index]); \ + dest += 8; \ + } while (0); + +// Store 32 16x8 values, assuming stride == 32. +// Slight twist: store horizontally in blocks of 8. 
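/* The resulting order, modeled with scalars (b[v][k] standing for lane k of
 * vector v, and tran_low_t as used in this file): output row r of the
 * 32-wide block is assembled from vectors r, r + 8, r + 16 and r + 24. */
static void store_scalar(tran_low_t *a, const int16_t b[32][8]) {
  int r, j, k;
  for (r = 0; r < 8; ++r)
    for (j = 0; j < 4; ++j)
      for (k = 0; k < 8; ++k)
        a[r * 32 + j * 8 + k] = (tran_low_t)b[r + j * 8][k];
}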
+static INLINE void store(tran_low_t *a, const int16x8_t *b) { + STORE_S16(b, 0, a); + STORE_S16(b, 8, a); + STORE_S16(b, 16, a); + STORE_S16(b, 24, a); + STORE_S16(b, 1, a); + STORE_S16(b, 9, a); + STORE_S16(b, 17, a); + STORE_S16(b, 25, a); + STORE_S16(b, 2, a); + STORE_S16(b, 10, a); + STORE_S16(b, 18, a); + STORE_S16(b, 26, a); + STORE_S16(b, 3, a); + STORE_S16(b, 11, a); + STORE_S16(b, 19, a); + STORE_S16(b, 27, a); + STORE_S16(b, 4, a); + STORE_S16(b, 12, a); + STORE_S16(b, 20, a); + STORE_S16(b, 28, a); + STORE_S16(b, 5, a); + STORE_S16(b, 13, a); + STORE_S16(b, 21, a); + STORE_S16(b, 29, a); + STORE_S16(b, 6, a); + STORE_S16(b, 14, a); + STORE_S16(b, 22, a); + STORE_S16(b, 30, a); + STORE_S16(b, 7, a); + STORE_S16(b, 15, a); + STORE_S16(b, 23, a); + STORE_S16(b, 31, a); +} + +#undef STORE_S16 + +// fdct_round_shift((a +/- b) * c) +static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, + const tran_high_t constant, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c0 +/- b * c1) +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t constant0, + const tran_coef_t constant1, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0); + const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1); + const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1); + const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0); + const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. +static INLINE int16x8_t sub_round_shift(const int16x8_t a) { + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); +} + +static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. 
+ a[0] = vaddq_s16(in[0], in[15]); + a[1] = vaddq_s16(in[1], in[14]); + a[2] = vaddq_s16(in[2], in[13]); + a[3] = vaddq_s16(in[3], in[12]); + a[4] = vaddq_s16(in[4], in[11]); + a[5] = vaddq_s16(in[5], in[10]); + a[6] = vaddq_s16(in[6], in[9]); + a[7] = vaddq_s16(in[7], in[8]); + + a[8] = vsubq_s16(in[7], in[8]); + a[9] = vsubq_s16(in[6], in[9]); + a[10] = vsubq_s16(in[5], in[10]); + a[11] = vsubq_s16(in[4], in[11]); + a[12] = vsubq_s16(in[3], in[12]); + a[13] = vsubq_s16(in[2], in[13]); + a[14] = vsubq_s16(in[1], in[14]); + a[15] = vsubq_s16(in[0], in[15]); + + a[16] = in[16]; + a[17] = in[17]; + a[18] = in[18]; + a[19] = in[19]; + + butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]); + butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]); + butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]); + butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]); + + a[28] = in[28]; + a[29] = in[29]; + a[30] = in[30]; + a[31] = in[31]; + + // Stage 3. + b[0] = vaddq_s16(a[0], a[7]); + b[1] = vaddq_s16(a[1], a[6]); + b[2] = vaddq_s16(a[2], a[5]); + b[3] = vaddq_s16(a[3], a[4]); + + b[4] = vsubq_s16(a[3], a[4]); + b[5] = vsubq_s16(a[2], a[5]); + b[6] = vsubq_s16(a[1], a[6]); + b[7] = vsubq_s16(a[0], a[7]); + + b[8] = a[8]; + b[9] = a[9]; + + butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]); + butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]); + + b[14] = a[14]; + b[15] = a[15]; + + b[16] = vaddq_s16(in[16], a[23]); + b[17] = vaddq_s16(in[17], a[22]); + b[18] = vaddq_s16(in[18], a[21]); + b[19] = vaddq_s16(in[19], a[20]); + + b[20] = vsubq_s16(in[19], a[20]); + b[21] = vsubq_s16(in[18], a[21]); + b[22] = vsubq_s16(in[17], a[22]); + b[23] = vsubq_s16(in[16], a[23]); + + b[24] = vsubq_s16(in[31], a[24]); + b[25] = vsubq_s16(in[30], a[25]); + b[26] = vsubq_s16(in[29], a[26]); + b[27] = vsubq_s16(in[28], a[27]); + + b[28] = vaddq_s16(in[28], a[27]); + b[29] = vaddq_s16(in[29], a[26]); + b[30] = vaddq_s16(in[30], a[25]); + b[31] = vaddq_s16(in[31], a[24]); + + // Stage 4. + a[0] = vaddq_s16(b[0], b[3]); + a[1] = vaddq_s16(b[1], b[2]); + a[2] = vsubq_s16(b[1], b[2]); + a[3] = vsubq_s16(b[0], b[3]); + + a[4] = b[4]; + + butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]); + + a[7] = b[7]; + + a[8] = vaddq_s16(b[8], b[11]); + a[9] = vaddq_s16(b[9], b[10]); + a[10] = vsubq_s16(b[9], b[10]); + a[11] = vsubq_s16(b[8], b[11]); + a[12] = vsubq_s16(b[15], b[12]); + a[13] = vsubq_s16(b[14], b[13]); + a[14] = vaddq_s16(b[14], b[13]); + a[15] = vaddq_s16(b[15], b[12]); + + a[16] = b[16]; + a[17] = b[17]; + + butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]); + butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]); + butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]); + butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]); + + a[22] = b[22]; + a[23] = b[23]; + a[24] = b[24]; + a[25] = b[25]; + + a[30] = b[30]; + a[31] = b[31]; + + // Stage 5. 
+ butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]); + butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]); + + b[4] = vaddq_s16(a[4], a[5]); + b[5] = vsubq_s16(a[4], a[5]); + b[6] = vsubq_s16(a[7], a[6]); + b[7] = vaddq_s16(a[7], a[6]); + + b[8] = a[8]; + + butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]); + butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]); + + b[11] = a[11]; + b[12] = a[12]; + + b[15] = a[15]; + + b[16] = vaddq_s16(a[19], a[16]); + b[17] = vaddq_s16(a[18], a[17]); + b[18] = vsubq_s16(a[17], a[18]); + b[19] = vsubq_s16(a[16], a[19]); + b[20] = vsubq_s16(a[23], a[20]); + b[21] = vsubq_s16(a[22], a[21]); + b[22] = vaddq_s16(a[21], a[22]); + b[23] = vaddq_s16(a[20], a[23]); + b[24] = vaddq_s16(a[27], a[24]); + b[25] = vaddq_s16(a[26], a[25]); + b[26] = vsubq_s16(a[25], a[26]); + b[27] = vsubq_s16(a[24], a[27]); + b[28] = vsubq_s16(a[31], a[28]); + b[29] = vsubq_s16(a[30], a[29]); + b[30] = vaddq_s16(a[29], a[30]); + b[31] = vaddq_s16(a[28], a[31]); + + // Stage 6. + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + + butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]); + butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]); + + a[8] = vaddq_s16(b[8], b[9]); + a[9] = vsubq_s16(b[8], b[9]); + a[10] = vsubq_s16(b[11], b[10]); + a[11] = vaddq_s16(b[11], b[10]); + a[12] = vaddq_s16(b[12], b[13]); + a[13] = vsubq_s16(b[12], b[13]); + a[14] = vsubq_s16(b[15], b[14]); + a[15] = vaddq_s16(b[15], b[14]); + + a[16] = b[16]; + a[19] = b[19]; + a[20] = b[20]; + a[23] = b[23]; + a[24] = b[24]; + a[27] = b[27]; + a[28] = b[28]; + a[31] = b[31]; + + butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]); + butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]); + + butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]); + butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]); + + // Stage 7. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + b[4] = a[4]; + b[5] = a[5]; + b[6] = a[6]; + b[7] = a[7]; + + butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]); + butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]); + butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]); + butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]); + + b[16] = vaddq_s16(a[16], a[17]); + b[17] = vsubq_s16(a[16], a[17]); + b[18] = vsubq_s16(a[19], a[18]); + b[19] = vaddq_s16(a[19], a[18]); + b[20] = vaddq_s16(a[20], a[21]); + b[21] = vsubq_s16(a[20], a[21]); + b[22] = vsubq_s16(a[23], a[22]); + b[23] = vaddq_s16(a[23], a[22]); + b[24] = vaddq_s16(a[24], a[25]); + b[25] = vsubq_s16(a[24], a[25]); + b[26] = vsubq_s16(a[27], a[26]); + b[27] = vaddq_s16(a[27], a[26]); + b[28] = vaddq_s16(a[28], a[29]); + b[29] = vsubq_s16(a[28], a[29]); + b[30] = vsubq_s16(a[31], a[30]); + b[31] = vaddq_s16(a[31], a[30]); + + // Final stage. 
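/* Ordering note for the stores below: each of b[0..15] lands at the 5-bit
 * bit-reversal of its index (b[1] -> out[16], b[2] -> out[8],
 * b[3] -> out[24], ...), the usual decimation-in-frequency layout, while
 * the b[16..31] pairs are rotated by the odd cospi constants to produce
 * the odd-indexed outputs. */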
+ // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[0] = sub_round_shift(b[0]); + out[16] = sub_round_shift(b[1]); + out[8] = sub_round_shift(b[2]); + out[24] = sub_round_shift(b[3]); + out[4] = sub_round_shift(b[4]); + out[20] = sub_round_shift(b[5]); + out[12] = sub_round_shift(b[6]); + out[28] = sub_round_shift(b[7]); + out[2] = sub_round_shift(b[8]); + out[18] = sub_round_shift(b[9]); + out[10] = sub_round_shift(b[10]); + out[26] = sub_round_shift(b[11]); + out[6] = sub_round_shift(b[12]); + out[22] = sub_round_shift(b[13]); + out[14] = sub_round_shift(b[14]); + out[30] = sub_round_shift(b[15]); + + butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]); + out[1] = sub_round_shift(a[1]); + out[31] = sub_round_shift(a[31]); + + butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]); + out[17] = sub_round_shift(a[17]); + out[15] = sub_round_shift(a[15]); + + butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]); + out[9] = sub_round_shift(a[9]); + out[23] = sub_round_shift(a[23]); + + butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]); + out[25] = sub_round_shift(a[25]); + out[7] = sub_round_shift(a[7]); + + butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]); + out[5] = sub_round_shift(a[5]); + out[27] = sub_round_shift(a[27]); + + butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]); + out[21] = sub_round_shift(a[21]); + out[11] = sub_round_shift(a[11]); + + butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]); + out[13] = sub_round_shift(a[13]); + out[19] = sub_round_shift(a[19]); + + butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]); + out[29] = sub_round_shift(a[29]); + out[3] = sub_round_shift(a[3]); +} + +#define PASS_THROUGH(src, dst, element) \ + do { \ + dst##_lo[element] = src##_lo[element]; \ + dst##_hi[element] = src##_hi[element]; \ + } while (0) + +#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ + do { \ + c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ + c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ + } while (0) + +#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ + do { \ + temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ + temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ + c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ + c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ + } while (0) + +#define ADD_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define SUB_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = 
vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +// Like butterfly_one_coeff, but don't narrow results. +static INLINE void butterfly_one_coeff_s16_s32( + const int16x8_t a, const int16x8_t b, const tran_high_t constant, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ + add_index, sub_index) \ + do { \ + butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ + &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +// Like butterfly_one_coeff, but with s32. +static INLINE void butterfly_one_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant); + const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant); + const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant); + const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant); + const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant); + const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ + sub_index) \ + do { \ + butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + constant, &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +// Like butterfly_two_coeff, but with s32. 
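+// Per lane this computes, with the same rounding:
+//   add = ROUND_POWER_OF_TWO(a * constant1 + b * constant0, DCT_CONST_BITS);
+//   sub = ROUND_POWER_OF_TWO(a * constant0 - b * constant1, DCT_CONST_BITS);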
+static INLINE void butterfly_two_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const int32_t constant0, const int32_t constant1, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmulq_n_s32(a_lo, constant0); + const int32x4_t a1 = vmulq_n_s32(a_hi, constant0); + const int32x4_t a2 = vmulq_n_s32(a_lo, constant1); + const int32x4_t a3 = vmulq_n_s32(a_hi, constant1); + const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0); + const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0); + const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1); + const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ + right_constant, b, add_index, sub_index) \ + do { \ + butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + left_constant, right_constant, &b##_lo[add_index], \ + &b##_hi[add_index], &b##_lo[sub_index], \ + &b##_hi[sub_index]); \ + } while (0) + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, + const int32x4_t a_hi) { + const int32x4_t one = vdupq_n_s32(1); + const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); + const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); + const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32); + const int16x4_t b_lo = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2); + const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi); + const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31); + const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32); + const int16x4_t b_hi = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2); + return vcombine_s16(b_lo, b_hi); +} + +static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + int32x4_t c_lo[32]; + int32x4_t c_hi[32]; + int32x4_t d_lo[32]; + int32x4_t d_hi[32]; + + // Stage 1. Done as part of the load for the first pass. 
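+  // (Here the cross add/sub is done explicitly: a[i] = in[i] + in[31 - i]
+  // for i < 16, and a[i] = in[31 - i] - in[i] for i >= 16.)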
+ a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + + b[16] = a[16]; + b[17] = a[17]; + b[18] = a[18]; + b[19] = a[19]; + + butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); + + b[28] = a[28]; + b[29] = a[29]; + b[30] = a[30]; + b[31] = a[31]; + + // Stage 3. With extreme values for input this calculation rolls over int16_t. + // The sources for b[0] get added multiple times and, through testing, have + // been shown to overflow starting here. + ADD_S16_S32(b, 0, 7, c, 0); + ADD_S16_S32(b, 1, 6, c, 1); + ADD_S16_S32(b, 2, 5, c, 2); + ADD_S16_S32(b, 3, 4, c, 3); + SUB_S16_S32(b, 3, 4, c, 4); + SUB_S16_S32(b, 2, 5, c, 5); + SUB_S16_S32(b, 1, 6, c, 6); + SUB_S16_S32(b, 0, 7, c, 7); + + a[8] = b[8]; + a[9] = b[9]; + + BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); + BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); + + a[14] = b[14]; + a[15] = b[15]; + + ADD_S16_S32(b, 16, 23, c, 16); + ADD_S16_S32(b, 17, 22, c, 17); + ADD_S16_S32(b, 18, 21, c, 18); + ADD_S16_S32(b, 19, 20, c, 19); + SUB_S16_S32(b, 19, 20, c, 20); + SUB_S16_S32(b, 18, 21, c, 21); + SUB_S16_S32(b, 17, 22, c, 22); + SUB_S16_S32(b, 16, 23, c, 23); + SUB_S16_S32(b, 31, 24, c, 24); + SUB_S16_S32(b, 30, 25, c, 25); + SUB_S16_S32(b, 29, 26, c, 26); + SUB_S16_S32(b, 28, 27, c, 27); + ADD_S16_S32(b, 28, 27, c, 28); + ADD_S16_S32(b, 29, 26, c, 29); + ADD_S16_S32(b, 30, 25, c, 30); + ADD_S16_S32(b, 31, 24, c, 31); + + // Stage 4. 
+ ADD_S32(c, 0, 3, d, 0); + ADD_S32(c, 1, 2, d, 1); + SUB_S32(c, 1, 2, d, 2); + SUB_S32(c, 0, 3, d, 3); + + PASS_THROUGH(c, d, 4); + + BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); + + PASS_THROUGH(c, d, 7); + + ADDW_S16_S32(c, 11, a, 8, d, 8); + ADDW_S16_S32(c, 10, a, 9, d, 9); + SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); + SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); + SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); + SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); + ADDW_S16_S32(c, 13, b, 14, d, 14); + ADDW_S16_S32(c, 12, b, 15, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 17); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19); + BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20); + BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21); + + PASS_THROUGH(c, d, 22); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 25); + + PASS_THROUGH(c, d, 30); + PASS_THROUGH(c, d, 31); + + // Stage 5. + BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); + BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3); + + ADD_S32(d, 4, 5, c, 4); + SUB_S32(d, 4, 5, c, 5); + SUB_S32(d, 7, 6, c, 6); + ADD_S32(d, 7, 6, c, 7); + + PASS_THROUGH(d, c, 8); + + BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9); + BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10); + + PASS_THROUGH(d, c, 11); + PASS_THROUGH(d, c, 12); + PASS_THROUGH(d, c, 15); + + ADD_S32(d, 16, 19, c, 16); + ADD_S32(d, 17, 18, c, 17); + SUB_S32(d, 17, 18, c, 18); + SUB_S32(d, 16, 19, c, 19); + SUB_S32(d, 23, 20, c, 20); + SUB_S32(d, 22, 21, c, 21); + ADD_S32(d, 22, 21, c, 22); + ADD_S32(d, 23, 20, c, 23); + ADD_S32(d, 24, 27, c, 24); + ADD_S32(d, 25, 26, c, 25); + SUB_S32(d, 25, 26, c, 26); + SUB_S32(d, 24, 27, c, 27); + SUB_S32(d, 31, 28, c, 28); + SUB_S32(d, 30, 29, c, 29); + ADD_S32(d, 30, 29, c, 30); + ADD_S32(d, 31, 28, c, 31); + + // Stage 6. + PASS_THROUGH(c, d, 0); + PASS_THROUGH(c, d, 1); + PASS_THROUGH(c, d, 2); + PASS_THROUGH(c, d, 3); + + BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7); + BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6); + + ADD_S32(c, 8, 9, d, 8); + SUB_S32(c, 8, 9, d, 9); + SUB_S32(c, 11, 10, d, 10); + ADD_S32(c, 11, 10, d, 11); + ADD_S32(c, 12, 13, d, 12); + SUB_S32(c, 12, 13, d, 13); + SUB_S32(c, 15, 14, d, 14); + ADD_S32(c, 15, 14, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 19); + PASS_THROUGH(c, d, 20); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 27); + PASS_THROUGH(c, d, 28); + PASS_THROUGH(c, d, 31); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17); + BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21); + BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22); + + // Stage 7. 
+ PASS_THROUGH(d, c, 0); + PASS_THROUGH(d, c, 1); + PASS_THROUGH(d, c, 2); + PASS_THROUGH(d, c, 3); + PASS_THROUGH(d, c, 4); + PASS_THROUGH(d, c, 5); + PASS_THROUGH(d, c, 6); + PASS_THROUGH(d, c, 7); + + BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15); + BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13); + BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12); + + ADD_S32(d, 16, 17, c, 16); + SUB_S32(d, 16, 17, c, 17); + SUB_S32(d, 19, 18, c, 18); + ADD_S32(d, 19, 18, c, 19); + ADD_S32(d, 20, 21, c, 20); + SUB_S32(d, 20, 21, c, 21); + SUB_S32(d, 23, 22, c, 22); + ADD_S32(d, 23, 22, c, 23); + ADD_S32(d, 24, 25, c, 24); + SUB_S32(d, 24, 25, c, 25); + SUB_S32(d, 27, 26, c, 26); + ADD_S32(d, 27, 26, c, 27); + ADD_S32(d, 28, 29, c, 28); + SUB_S32(d, 28, 29, c, 29); + SUB_S32(d, 31, 30, c, 30); + ADD_S32(d, 31, 30, c, 31); + + // Final stage. + // Roll rounding into this function so we can pass back int16x8. + + out[0] = add_round_shift_s32(c_lo[0], c_hi[0]); + out[16] = add_round_shift_s32(c_lo[1], c_hi[1]); + + out[8] = add_round_shift_s32(c_lo[2], c_hi[2]); + out[24] = add_round_shift_s32(c_lo[3], c_hi[3]); + out[4] = add_round_shift_s32(c_lo[4], c_hi[4]); + out[20] = add_round_shift_s32(c_lo[5], c_hi[5]); + out[12] = add_round_shift_s32(c_lo[6], c_hi[6]); + + out[28] = add_round_shift_s32(c_lo[7], c_hi[7]); + out[2] = add_round_shift_s32(c_lo[8], c_hi[8]); + out[18] = add_round_shift_s32(c_lo[9], c_hi[9]); + out[10] = add_round_shift_s32(c_lo[10], c_hi[10]); + + out[26] = add_round_shift_s32(c_lo[11], c_hi[11]); + out[6] = add_round_shift_s32(c_lo[12], c_hi[12]); + out[22] = add_round_shift_s32(c_lo[13], c_hi[13]); + out[14] = add_round_shift_s32(c_lo[14], c_hi[14]); + out[30] = add_round_shift_s32(c_lo[15], c_hi[15]); + + BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31); + out[1] = add_round_shift_s32(d_lo[1], d_hi[1]); + out[31] = add_round_shift_s32(d_lo[31], d_hi[31]); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15); + out[17] = add_round_shift_s32(d_lo[17], d_hi[17]); + out[15] = add_round_shift_s32(d_lo[15], d_hi[15]); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23); + out[9] = add_round_shift_s32(d_lo[9], d_hi[9]); + out[23] = add_round_shift_s32(d_lo[23], d_hi[23]); + + BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7); + out[25] = add_round_shift_s32(d_lo[25], d_hi[25]); + out[7] = add_round_shift_s32(d_lo[7], d_hi[7]); + + BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27); + out[5] = add_round_shift_s32(d_lo[5], d_hi[5]); + out[27] = add_round_shift_s32(d_lo[27], d_hi[27]); + + BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11); + out[21] = add_round_shift_s32(d_lo[21], d_hi[21]); + out[11] = add_round_shift_s32(d_lo[11], d_hi[11]); + + BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19); + out[13] = add_round_shift_s32(d_lo[13], d_hi[13]); + out[19] = add_round_shift_s32(d_lo[19], d_hi[19]); + + BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3); + out[29] = add_round_shift_s32(d_lo[29], d_hi[29]); + out[3] = add_round_shift_s32(d_lo[3], d_hi[3]); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. 
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { + const int16x8_t one = vdupq_n_s16(1); + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2); +} + +static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1. Done as part of the load for the first pass. + a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. 
+ b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); + b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); + b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); + b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); + b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); + b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); + b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); + b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); + + b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); + b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); + b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); + b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); + b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); + b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); + b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); + b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); + + b[16] = add_round_shift_s16(a[16]); + b[17] = add_round_shift_s16(a[17]); + b[18] = add_round_shift_s16(a[18]); + b[19] = add_round_shift_s16(a[19]); + + butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); + b[20] = add_round_shift_s16(b[20]); + b[21] = add_round_shift_s16(b[21]); + b[22] = add_round_shift_s16(b[22]); + b[23] = add_round_shift_s16(b[23]); + b[24] = add_round_shift_s16(b[24]); + b[25] = add_round_shift_s16(b[25]); + b[26] = add_round_shift_s16(b[26]); + b[27] = add_round_shift_s16(b[27]); + + b[28] = add_round_shift_s16(a[28]); + b[29] = add_round_shift_s16(a[29]); + b[30] = add_round_shift_s16(a[30]); + b[31] = add_round_shift_s16(a[31]); + + // Stage 3. + a[0] = vaddq_s16(b[0], b[7]); + a[1] = vaddq_s16(b[1], b[6]); + a[2] = vaddq_s16(b[2], b[5]); + a[3] = vaddq_s16(b[3], b[4]); + + a[4] = vsubq_s16(b[3], b[4]); + a[5] = vsubq_s16(b[2], b[5]); + a[6] = vsubq_s16(b[1], b[6]); + a[7] = vsubq_s16(b[0], b[7]); + + a[8] = b[8]; + a[9] = b[9]; + + butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]); + butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]); + + a[14] = b[14]; + a[15] = b[15]; + + a[16] = vaddq_s16(b[16], b[23]); + a[17] = vaddq_s16(b[17], b[22]); + a[18] = vaddq_s16(b[18], b[21]); + a[19] = vaddq_s16(b[19], b[20]); + + a[20] = vsubq_s16(b[19], b[20]); + a[21] = vsubq_s16(b[18], b[21]); + a[22] = vsubq_s16(b[17], b[22]); + a[23] = vsubq_s16(b[16], b[23]); + + a[24] = vsubq_s16(b[31], b[24]); + a[25] = vsubq_s16(b[30], b[25]); + a[26] = vsubq_s16(b[29], b[26]); + a[27] = vsubq_s16(b[28], b[27]); + + a[28] = vaddq_s16(b[28], b[27]); + a[29] = vaddq_s16(b[29], b[26]); + a[30] = vaddq_s16(b[30], b[25]); + a[31] = vaddq_s16(b[31], b[24]); + + // Stage 4. 
+ b[0] = vaddq_s16(a[0], a[3]); + b[1] = vaddq_s16(a[1], a[2]); + b[2] = vsubq_s16(a[1], a[2]); + b[3] = vsubq_s16(a[0], a[3]); + + b[4] = a[4]; + + butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]); + + b[7] = a[7]; + + b[8] = vaddq_s16(a[8], a[11]); + b[9] = vaddq_s16(a[9], a[10]); + b[10] = vsubq_s16(a[9], a[10]); + b[11] = vsubq_s16(a[8], a[11]); + b[12] = vsubq_s16(a[15], a[12]); + b[13] = vsubq_s16(a[14], a[13]); + b[14] = vaddq_s16(a[14], a[13]); + b[15] = vaddq_s16(a[15], a[12]); + + b[16] = a[16]; + b[17] = a[17]; + + butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]); + butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]); + butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]); + butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]); + + b[22] = a[22]; + b[23] = a[23]; + b[24] = a[24]; + b[25] = a[25]; + + b[30] = a[30]; + b[31] = a[31]; + + // Stage 5. + butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]); + butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]); + + a[4] = vaddq_s16(b[4], b[5]); + a[5] = vsubq_s16(b[4], b[5]); + a[6] = vsubq_s16(b[7], b[6]); + a[7] = vaddq_s16(b[7], b[6]); + + a[8] = b[8]; + + butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]); + butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]); + + a[11] = b[11]; + a[12] = b[12]; + + a[15] = b[15]; + + a[16] = vaddq_s16(b[19], b[16]); + a[17] = vaddq_s16(b[18], b[17]); + a[18] = vsubq_s16(b[17], b[18]); + a[19] = vsubq_s16(b[16], b[19]); + a[20] = vsubq_s16(b[23], b[20]); + a[21] = vsubq_s16(b[22], b[21]); + a[22] = vaddq_s16(b[21], b[22]); + a[23] = vaddq_s16(b[20], b[23]); + a[24] = vaddq_s16(b[27], b[24]); + a[25] = vaddq_s16(b[26], b[25]); + a[26] = vsubq_s16(b[25], b[26]); + a[27] = vsubq_s16(b[24], b[27]); + a[28] = vsubq_s16(b[31], b[28]); + a[29] = vsubq_s16(b[30], b[29]); + a[30] = vaddq_s16(b[29], b[30]); + a[31] = vaddq_s16(b[28], b[31]); + + // Stage 6. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + + butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]); + butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]); + + b[8] = vaddq_s16(a[8], a[9]); + b[9] = vsubq_s16(a[8], a[9]); + b[10] = vsubq_s16(a[11], a[10]); + b[11] = vaddq_s16(a[11], a[10]); + b[12] = vaddq_s16(a[12], a[13]); + b[13] = vsubq_s16(a[12], a[13]); + b[14] = vsubq_s16(a[15], a[14]); + b[15] = vaddq_s16(a[15], a[14]); + + b[16] = a[16]; + b[19] = a[19]; + b[20] = a[20]; + b[23] = a[23]; + b[24] = a[24]; + b[27] = a[27]; + b[28] = a[28]; + b[31] = a[31]; + + butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]); + butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]); + + butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]); + butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]); + + // Stage 7. 
+ a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + a[4] = b[4]; + a[5] = b[5]; + a[6] = b[6]; + a[7] = b[7]; + + butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]); + butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]); + butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]); + butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]); + + a[16] = vaddq_s16(b[16], b[17]); + a[17] = vsubq_s16(b[16], b[17]); + a[18] = vsubq_s16(b[19], b[18]); + a[19] = vaddq_s16(b[19], b[18]); + a[20] = vaddq_s16(b[20], b[21]); + a[21] = vsubq_s16(b[20], b[21]); + a[22] = vsubq_s16(b[23], b[22]); + a[23] = vaddq_s16(b[23], b[22]); + a[24] = vaddq_s16(b[24], b[25]); + a[25] = vsubq_s16(b[24], b[25]); + a[26] = vsubq_s16(b[27], b[26]); + a[27] = vaddq_s16(b[27], b[26]); + a[28] = vaddq_s16(b[28], b[29]); + a[29] = vsubq_s16(b[28], b[29]); + a[30] = vsubq_s16(b[31], b[30]); + a[31] = vaddq_s16(b[31], b[30]); + + // Final stage. + out[0] = a[0]; + out[16] = a[1]; + out[8] = a[2]; + out[24] = a[3]; + out[4] = a[4]; + out[20] = a[5]; + out[12] = a[6]; + out[28] = a[7]; + out[2] = a[8]; + out[18] = a[9]; + out[10] = a[10]; + out[26] = a[11]; + out[6] = a[12]; + out[22] = a[13]; + out[14] = a[14]; + out[30] = a[15]; + + butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]); + butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17], + &out[15]); + butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]); + butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]); + butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]); + butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21], + &out[11]); + butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13], + &out[19]); + butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]); +} + +#undef PASS_THROUGH +#undef ADD_S16_S32 +#undef SUB_S16_S32 +#undef ADDW_S16_S32 +#undef SUBW_S16_S32 +#undef ADD_S32 +#undef SUB_S32 +#undef BUTTERFLY_ONE_S16_S32 +#undef BUTTERFLY_ONE_S32 +#undef BUTTERFLY_TWO_S32 + +// Transpose 8x8 to a new location. Don't use transpose_neon.h because those +// are all in-place. +// TODO(johannkoenig): share with other fdcts. +static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) { + // Swap 16 bit elements. + const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements. 
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), + vreinterpretq_s32_s16(c3.val[0])); + const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), + vreinterpretq_s32_s16(c3.val[1])); + + // Swap 64 bit elements + const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); + const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); + const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); + const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + + b[0] = e0.val[0]; + b[1] = e1.val[0]; + b[2] = e2.val[0]; + b[3] = e3.val[0]; + b[4] = e0.val[1]; + b[5] = e1.val[1]; + b[6] = e2.val[1]; + b[7] = e3.val[1]; +} + +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + + // Process in 8x32 columns. + load(input, stride, temp0); + dct_body_first_pass(temp0, temp1); + + load(input + 8, stride, temp0); + dct_body_first_pass(temp0, temp2); + + load(input + 16, stride, temp0); + dct_body_first_pass(temp0, temp3); + + load(input + 24, stride, temp0); + dct_body_first_pass(temp0, temp4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_8x8(&temp1[0], &temp0[0]); + transpose_8x8(&temp2[0], &temp0[8]); + transpose_8x8(&temp3[0], &temp0[16]); + transpose_8x8(&temp4[0], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output, temp5); + + // Second row of 8x32. 
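+  // (Rows 1-3 below repeat the same pattern; schematically, for each
+  // row in {1, 2, 3}:
+  //   transpose_8x8(&tempN[8 * row], &temp0[8 * (N - 1)]);  // N = 1..4
+  //   dct_body_second_pass(temp0, temp5);
+  //   4x transpose_s16_8x8(...); store(output + row * 8 * 32, temp5);)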
+ transpose_8x8(&temp1[8], &temp0[0]); + transpose_8x8(&temp2[8], &temp0[8]); + transpose_8x8(&temp3[8], &temp0[16]); + transpose_8x8(&temp4[8], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 8 * 32, temp5); + + // Third row of 8x32 + transpose_8x8(&temp1[16], &temp0[0]); + transpose_8x8(&temp2[16], &temp0[8]); + transpose_8x8(&temp3[16], &temp0[16]); + transpose_8x8(&temp4[16], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 16 * 32, temp5); + + // Final row of 8x32. + transpose_8x8(&temp1[24], &temp0[0]); + transpose_8x8(&temp2[24], &temp0[8]); + transpose_8x8(&temp3[24], &temp0[16]); + transpose_8x8(&temp4[24], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 24 * 32, temp5); +} + +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + + // Process in 8x32 columns. + load(input, stride, temp0); + dct_body_first_pass(temp0, temp1); + + load(input + 8, stride, temp0); + dct_body_first_pass(temp0, temp2); + + load(input + 16, stride, temp0); + dct_body_first_pass(temp0, temp3); + + load(input + 24, stride, temp0); + dct_body_first_pass(temp0, temp4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_8x8(&temp1[0], &temp0[0]); + transpose_8x8(&temp2[0], &temp0[8]); + transpose_8x8(&temp3[0], &temp0[16]); + transpose_8x8(&temp4[0], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output, temp5); + + // Second row of 8x32. 
+ transpose_8x8(&temp1[8], &temp0[0]); + transpose_8x8(&temp2[8], &temp0[8]); + transpose_8x8(&temp3[8], &temp0[16]); + transpose_8x8(&temp4[8], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 8 * 32, temp5); + + // Third row of 8x32 + transpose_8x8(&temp1[16], &temp0[0]); + transpose_8x8(&temp2[16], &temp0[8]); + transpose_8x8(&temp3[16], &temp0[16]); + transpose_8x8(&temp4[16], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 16 * 32, temp5); + + // Final row of 8x32. + transpose_8x8(&temp1[24], &temp0[0]); + transpose_8x8(&temp2[24], &temp0[8]); + transpose_8x8(&temp3[24], &temp0[16]); + transpose_8x8(&temp4[24], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 24 * 32, temp5); +} +#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && + // __GNUC__ == 4 && __GNUC_MINOR__ <= 9 diff --git a/libvpx/vpx_dsp/arm/fdct_neon.c b/libvpx/vpx_dsp/arm/fdct_neon.c index fe78f3f51..04646ed2e 100644 --- a/libvpx/vpx_dsp/arm/fdct_neon.c +++ b/libvpx/vpx_dsp/arm/fdct_neon.c @@ -50,8 +50,8 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. 
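   // (Products of the 16-bit inputs and the roughly 14-bit cospi constants
   // need 32 bits until fdct_round_shift narrows them back.)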
const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); - const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int16_t)cospi_16_64); - const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int16_t)cospi_16_64); + const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64); + const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64); // fdct_round_shift int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); @@ -59,13 +59,11 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, // s_3 * cospi_8_64 + s_2 * cospi_24_64 // s_3 * cospi_24_64 - s_2 * cospi_8_64 - const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int16_t)cospi_8_64); - const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int16_t)cospi_24_64); + const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64); + const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64); - const int32x4_t temp3 = - vmlal_n_s16(s_3_cospi_8_64, s_2, (int16_t)cospi_24_64); - const int32x4_t temp4 = - vmlsl_n_s16(s_3_cospi_24_64, s_2, (int16_t)cospi_8_64); + const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64); + const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64); // fdct_round_shift int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); diff --git a/libvpx/vpx_dsp/arm/fdct_partial_neon.c b/libvpx/vpx_dsp/arm/fdct_partial_neon.c new file mode 100644 index 000000000..e73de41d7 --- /dev/null +++ b/libvpx/vpx_dsp/arm/fdct_partial_neon.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE tran_low_t get_lane(const int32x2_t a) { +#if CONFIG_VP9_HIGHBITDEPTH + return vget_lane_s32(a, 0); +#else + return vget_lane_s16(vreinterpret_s16_s32(a), 0); +#endif // CONFIG_VP9_HIGHBITDETPH +} + +void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x4_t a0, a1, a2, a3; + int16x8_t b0, b1; + int16x8_t c; + int32x2_t d; + + a0 = vld1_s16(input); + input += stride; + a1 = vld1_s16(input); + input += stride; + a2 = vld1_s16(input); + input += stride; + a3 = vld1_s16(input); + + b0 = vcombine_s16(a0, a1); + b1 = vcombine_s16(a2, a3); + + c = vaddq_s16(b0, b1); + + d = horizontal_add_int16x8(c); + + output[0] = get_lane(vshl_n_s32(d, 1)); + output[1] = 0; +} + +void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + + output[0] = get_lane(horizontal_add_int16x8(sum)); + output[1] = 0; +} + +void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int r; + int16x8_t left = vld1q_s16(input); + int16x8_t right = vld1q_s16(input + 8); + int32x2_t sum; + input += stride; + + for (r = 1; r < 16; ++r) { + const int16x8_t a = vld1q_s16(input); + const int16x8_t b = vld1q_s16(input + 8); + input += stride; + left = vaddq_s16(left, a); + right = vaddq_s16(right, b); + } + + sum = vadd_s32(horizontal_add_int16x8(left), horizontal_add_int16x8(right)); + + output[0] = get_lane(vshr_n_s32(sum, 1)); + output[1] = 0; +} + +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int r; + int16x8_t a0 = vld1q_s16(input); + int16x8_t a1 = vld1q_s16(input + 8); + int16x8_t a2 = vld1q_s16(input + 16); + int16x8_t a3 = vld1q_s16(input + 24); + int32x2_t sum; + input += stride; + + for (r = 1; r < 32; ++r) { + const int16x8_t b0 = vld1q_s16(input); + const int16x8_t b1 = vld1q_s16(input + 8); + const int16x8_t b2 = vld1q_s16(input + 16); + const int16x8_t b3 = vld1q_s16(input + 24); + input += stride; + a0 = vaddq_s16(a0, b0); + a1 = vaddq_s16(a1, b1); + a2 = vaddq_s16(a2, b2); + a3 = vaddq_s16(a3, b3); + } + + sum = vadd_s32(horizontal_add_int16x8(a0), horizontal_add_int16x8(a1)); + sum = vadd_s32(sum, horizontal_add_int16x8(a2)); + sum = vadd_s32(sum, horizontal_add_int16x8(a3)); + output[0] = get_lane(vshr_n_s32(sum, 3)); + output[1] = 0; +} diff --git a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/libvpx/vpx_dsp/arm/fwd_txfm_neon.c index c449b4660..8049277b1 100644 --- a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c +++ b/libvpx/vpx_dsp/arm/fwd_txfm_neon.c @@ -48,18 +48,18 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); - int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); - int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); - int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); - v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); - v_t2_hi = vmlal_n_s16(v_t2_hi, 
vget_high_s16(v_x3), (int16_t)cospi_8_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); - v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64); - v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64); - v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64); - v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); { const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); @@ -77,10 +77,10 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, // Stage 2 v_x0 = vsubq_s16(v_s6, v_s5); v_x1 = vaddq_s16(v_s6, v_s5); - v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64); { const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); @@ -95,22 +95,22 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, v_x3 = vaddq_s16(v_s7, cd); } // Stage 4 - v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); - v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); - v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); - v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); - v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); - v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); - v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); - v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); - v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); - v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); - v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, 
vget_high_s16(v_x0), cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64); { const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); @@ -207,24 +207,3 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, store_s16q_to_tran_low(final_output + 7 * 8, input_7); } } - -void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { - int r; - int16x8_t sum = vld1q_s16(&input[0]); - for (r = 1; r < 8; ++r) { - const int16x8_t input_00 = vld1q_s16(&input[r * stride]); - sum = vaddq_s16(sum, input_00); - } - { - const int32x4_t a = vpaddlq_s16(sum); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); -#if CONFIG_VP9_HIGHBITDEPTH - output[0] = vget_lane_s32(c, 0); -#else - output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); -#endif - output[1] = 0; - } -} diff --git a/libvpx/vpx_dsp/arm/hadamard_neon.c b/libvpx/vpx_dsp/arm/hadamard_neon.c index 79bedd848..523a63c6f 100644 --- a/libvpx/vpx_dsp/arm/hadamard_neon.c +++ b/libvpx/vpx_dsp/arm/hadamard_neon.c @@ -47,7 +47,7 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, *a7 = vaddq_s16(c1, c5); } -void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int16x8_t a0 = vld1q_s16(src_diff); int16x8_t a1 = vld1q_s16(src_diff + src_stride); @@ -76,7 +76,7 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, store_s16q_to_tran_low(coeff + 56, a7); } -void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int i; diff --git a/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c index 98e42cd25..5358839b5 100644 --- a/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c +++ b/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c @@ -1410,10 +1410,10 @@ static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest, void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - const tran_low_t out0 = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - const tran_low_t out1 = - HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); const int16x8_t dc = vdupq_n_s16(a1); int i; diff --git a/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c 
b/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c index 63eb49678..c1354c0c1 100644 --- a/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c +++ b/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c @@ -61,10 +61,10 @@ static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest, void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - const tran_low_t out0 = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - const tran_low_t out1 = - HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); const int16x8_t dc = vdupq_n_s16(a1); int i; diff --git a/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c index 20b09f683..1418a75a1 100644 --- a/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c +++ b/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -54,10 +54,10 @@ static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); - const tran_low_t out0 = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - const tran_low_t out1 = - HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); const int16x8_t dc = vdupq_n_s16(a1); diff --git a/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c index 6687e7649..dd90134a6 100644 --- a/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c +++ b/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -38,10 +38,10 @@ static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest, void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - const tran_low_t out0 = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - const tran_low_t out1 = - HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); const int16x8_t dc = vdupq_n_s16(a1); diff --git a/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c index 74345e1fa..c46c01631 100644 --- a/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c +++ b/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -17,8 +17,9 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_ports/mem.h" -static INLINE void load_4x4(const int16_t *s, ptrdiff_t p, int16x4_t *s0, - int16x4_t *s1, int16x4_t *s2, int16x4_t *s3) { +static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2, int16x4_t *const s3) { *s0 = vld1_s16(s); s += p; *s1 = vld1_s16(s); @@ -28,8 +29,9 @@ static INLINE void load_4x4(const int16_t *s, ptrdiff_t p, int16x4_t *s0, *s3 = vld1_s16(s); } -static INLINE void load_8x4(const 
uint16_t *s, ptrdiff_t p, uint16x8_t *s0, - uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3) { +static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *const s0, uint16x8_t *const s1, + uint16x8_t *const s2, uint16x8_t *const s3) { *s0 = vld1q_u16(s); s += p; *s1 = vld1q_u16(s); @@ -39,10 +41,11 @@ static INLINE void load_8x4(const uint16_t *s, ptrdiff_t p, uint16x8_t *s0, *s3 = vld1q_u16(s); } -static INLINE void load_8x8(const int16_t *s, ptrdiff_t p, int16x8_t *s0, - int16x8_t *s1, int16x8_t *s2, int16x8_t *s3, - int16x8_t *s4, int16x8_t *s5, int16x8_t *s6, - int16x8_t *s7) { +static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4, int16x8_t *const s5, + int16x8_t *const s6, int16x8_t *const s7) { *s0 = vld1q_s16(s); s += p; *s1 = vld1q_s16(s); @@ -60,11 +63,11 @@ static INLINE void load_8x8(const int16_t *s, ptrdiff_t p, int16x8_t *s0, *s7 = vld1q_s16(s); } -static INLINE void store_8x8(uint16_t *s, ptrdiff_t p, const uint16x8_t s0, - const uint16x8_t s1, const uint16x8_t s2, - const uint16x8_t s3, const uint16x8_t s4, - const uint16x8_t s5, const uint16x8_t s6, - const uint16x8_t s7) { +static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3, + const uint16x8_t s4, const uint16x8_t s5, + const uint16x8_t s6, const uint16x8_t s7) { vst1q_u16(s, s0); s += p; vst1q_u16(s, s1); @@ -82,16 +85,15 @@ static INLINE void store_8x8(uint16_t *s, ptrdiff_t p, const uint16x8_t s0, vst1q_u16(s, s7); } -static INLINE int32x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, - const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, - const int16x8_t filters) { +static INLINE int32x4_t highbd_convolve8_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); - int32x4_t sum = vdupq_n_s32(0); + int32x4_t sum; - sum = vmlal_lane_s16(sum, s0, filters_lo, 0); + sum = vmull_lane_s16(s0, filters_lo, 0); sum = vmlal_lane_s16(sum, s1, filters_lo, 1); sum = vmlal_lane_s16(sum, s2, filters_lo, 2); sum = vmlal_lane_s16(sum, s3, filters_lo, 3); @@ -102,19 +104,17 @@ static INLINE int32x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, return sum; } -static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, - const int16x8_t s2, const int16x8_t s3, - const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, - const int16x8_t filters, - const uint16x8_t max) { +static INLINE uint16x8_t +highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filters, const uint16x8_t max) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); - int32x4_t sum0 = vdupq_n_s32(0); - int32x4_t sum1 = vdupq_n_s32(0); + int32x4_t sum0, sum1; uint16x8_t d; - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filters_lo, 0); + sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1); sum0 = 
vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3); @@ -122,7 +122,7 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filters_lo, 0); + sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3); @@ -137,15 +137,14 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { if (x_step_q4 != 16) { - vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); } else { - const int16x8_t filters = vld1q_s16(filter_x); + const int16x8_t filters = vld1q_s16(filter[x0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x8_t t0, t1, t2, t3; @@ -182,10 +181,10 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); transpose_s16_4x4d(&s7, &s8, &s9, &s10); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); @@ -241,10 +240,11 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); transpose_u16_8x4(&d0, &d1, &d2, &d3); vst1_u16(dst, vget_low_u16(d0)); @@ -302,14 +302,22 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, &s12, &s13, &s14); transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, 
filters, max); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max); + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, + max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, + max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, + max); + d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, + max); + d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, + max); + d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, + max); + d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, + max); + d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, + filters, max); transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); @@ -337,15 +345,15 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h, int bd) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h, int bd) { if (x_step_q4 != 16) { - vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); } else { - const int16x8_t filters = vld1q_s16(filter_x); + const int16x8_t filters = vld1q_s16(filter[x0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x8_t t0, t1, t2, t3; @@ -382,10 +390,10 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); transpose_s16_4x4d(&s7, &s8, &s9, &s10); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); @@ -448,10 +456,11 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + t1 = 
highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + t3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); transpose_u16_8x4(&t0, &t1, &t2, &t3); d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), @@ -522,14 +531,22 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, &s12, &s13, &s14); transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max); + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, + max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, + max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, + max); + d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, + max); + d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, + max); + d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, + max); + d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, + max); + d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, + filters, max); transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); @@ -566,15 +583,14 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { if (y_step_q4 != 16) { - vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); } else { - const int16x8_t filters = vld1q_s16(filter_y); + const int16x8_t filters = vld1q_s16(filter[y0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); assert(!((intptr_t)dst & 3)); @@ -620,10 +636,10 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); @@ -698,10 +714,11 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, 
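This hunk migrates the vertical path to the same prototype as the horizontal ones above: the paired int16_t *filter_x / *filter_y pointers (one of which was always unused per direction) become a single InterpKernel table plus x0_q4/y0_q4 phase offsets. A minimal sketch of what that indexing means, assuming the upstream typedef in vpx_dsp/vpx_filter.h (SUBPEL_TAPS == 8, 16 subpel phases):

#include <stdint.h>

#define SUBPEL_TAPS 8
typedef int16_t InterpKernel[SUBPEL_TAPS];

/* Each table entry is one contiguous 8-tap kernel, which is why the NEON
 * code can fetch the whole kernel with a single vld1q_s16(filter[y0_q4]). */
static const int16_t *select_kernel(const InterpKernel *filter, int phase_q4) {
  return filter[phase_q4]; /* phase_q4 in [0, 15] picks one subpel phase */
}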
__builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); vst1q_u16(d, d0); d += dst_stride; @@ -732,15 +749,15 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { if (y_step_q4 != 16) { - vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); } else { - const int16x8_t filters = vld1q_s16(filter_y); + const int16x8_t filters = vld1q_s16(filter[y0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); assert(!((intptr_t)dst & 3)); @@ -786,10 +803,10 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); @@ -872,10 +889,11 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + t3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); d0 = vld1q_u16(d + 0 * dst_stride); d1 = vld1q_u16(d + 1 * dst_stride); diff --git a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c index 4ff3dea08..765a054f8 100644 --- a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c +++ 
b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c @@ -15,13 +15,14 @@ void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; if (w < 8) { // avg4 diff --git a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c index 61712d48e..9d2752e09 100644 --- a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c +++ b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -15,13 +15,14 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; if (w < 8) { // copy4 diff --git a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c index f769620a4..414ade353 100644 --- a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c +++ b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c @@ -15,12 +15,11 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); // + 1 to make it divisible by 4 - DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]); + uint16_t temp[64 * 136]; const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; @@ -29,22 +28,21 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, * buffer which has lots of extra room and is subsequently discarded this is * safe if somewhat less than ideal. 
*/ vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, - filter_x, x_step_q4, filter_y, y_step_q4, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height, bd); /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); // + 1 to make it divisible by 4 - DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]); + uint16_t temp[64 * 136]; const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; @@ -52,8 +50,9 @@ void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, * to average the values after both passes. */ vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, - filter_x, x_step_q4, filter_y, y_step_q4, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height, bd); - vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); } diff --git a/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c index 968bc5cc3..bf5192a68 100644 --- a/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c +++ b/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c @@ -32,7 +32,8 @@ static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride, void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { - const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); diff --git a/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c index 604d82abd..8920b9336 100644 --- a/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c +++ b/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -39,7 +39,8 @@ static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride, void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { int i; - const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); diff --git a/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c index 21d21b033..a14b89543 100644 --- a/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c +++ b/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -32,7 +32,8 @@ static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride, void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { - const int16_t out0 = 
WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); const int16x8_t dc = vdupq_n_s16(a1); diff --git a/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c index 7bcce913b..ce9b45958 100644 --- a/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c +++ b/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c @@ -36,7 +36,8 @@ static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride, void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { - const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); diff --git a/libvpx/vpx_dsp/arm/idct_neon.h b/libvpx/vpx_dsp/arm/idct_neon.h index 0fc1de8e4..6ed02af5a 100644 --- a/libvpx/vpx_dsp/arm/idct_neon.h +++ b/libvpx/vpx_dsp/arm/idct_neon.h @@ -18,7 +18,7 @@ #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/vpx_dsp_common.h" -DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = { +static const int16_t kCospi[16] = { 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, @@ -29,7 +29,7 @@ DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = { 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */ }; -DECLARE_ALIGNED(16, static const int32_t, kCospi32[16]) = { +static const int32_t kCospi32[16] = { 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, diff --git a/libvpx/vpx_dsp/arm/mem_neon.h b/libvpx/vpx_dsp/arm/mem_neon.h index 37b89b276..4efad5333 100644 --- a/libvpx/vpx_dsp/arm/mem_neon.h +++ b/libvpx/vpx_dsp/arm/mem_neon.h @@ -79,6 +79,32 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { memcpy(buf, &a, 4); } +// Load 2 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { + uint32_t a; + uint32x2_t a_u32 = vdup_n_u32(0); + if (stride == 4) return vld1_u8(buf); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vld1_lane_u32(&a, a_u32, 0); + memcpy(&a, buf, 4); + a_u32 = vld1_lane_u32(&a, a_u32, 1); + return vreinterpret_u8_u32(a_u32); +} + +// Store 2 sets of 4 bytes when alignment is not guaranteed. +static INLINE void store_unaligned_u8(uint8_t *buf, int stride, + const uint8x8_t a) { + const uint32x2_t a_u32 = vreinterpret_u32_u8(a); + if (stride == 4) { + vst1_u8(buf, a); + return; + } + uint32_to_mem(buf, vget_lane_u32(a_u32, 0)); + buf += stride; + uint32_to_mem(buf, vget_lane_u32(a_u32, 1)); +} + // Load 4 sets of 4 bytes when alignment is not guaranteed. static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { uint32_t a; diff --git a/libvpx/vpx_dsp/arm/quantize_neon.c b/libvpx/vpx_dsp/arm/quantize_neon.c new file mode 100644 index 000000000..a0a1e6dd5 --- /dev/null +++ b/libvpx/vpx_dsp/arm/quantize_neon.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + // Process first 8 values which include a dc component. + { + // Only the first element of each vector is DC. + const int16x8_t zbin = vld1q_s16(zbin_ptr); + const int16x8_t round = vld1q_s16(round_ptr); + const int16x8_t quant = vld1q_s16(quant_ptr); + const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + const int16x8_t dequant = vld1q_s16(dequant_ptr); + // Add one because the eob does not index from 0. + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + qcoeff = vmulq_s16(qcoeff, dequant); + + store_s16q_to_tran_low(dqcoeff_ptr, qcoeff); + dqcoeff_ptr += 8; + } + + n_coeffs -= 8; + + { + const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]); + const int16x8_t round = vdupq_n_s16(round_ptr[1]); + const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); + const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); + const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + + do { + // Add one because the eob is not its index. 
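The "+ 1" comments here are the crux of the eob logic. A minimal scalar model (mine, not from the patch) of what the vtst/vand/vmax lanes compute before the final vpmax cascade folds them to a single value:

#include <stdint.h>

/* Each lane holds iscan[i] + 1 wherever qcoeff[i] != 0 (vtstq_s16 yields
 * all-ones there and vandq_u16 keeps the scan index); the running
 * vmaxq_u16 then tracks the largest such index, i.e. the end-of-block. */
static uint16_t eob_model(const int16_t *qcoeff, const int16_t *iscan, int n) {
  uint16_t eob = 0;
  int i;
  for (i = 0; i < n; ++i) {
    const uint16_t v = qcoeff[i] ? (uint16_t)(iscan[i] + 1) : 0;
    if (v > eob) eob = v;
  }
  return eob;
}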
+ const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + qcoeff = vmulq_s16(qcoeff, dequant); + + store_s16q_to_tran_low(dqcoeff_ptr, qcoeff); + dqcoeff_ptr += 8; + + n_coeffs -= 8; + } while (n_coeffs > 0); + } + + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +} + +static INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +// Main difference is that zbin values are halved before comparison and dqcoeff +// values are divided by 2. zbin is rounded but dqcoeff is not. +void vpx_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + int i; + (void)scan_ptr; + (void)n_coeffs; // Because we will always calculate 32*32. + (void)skip_block; + assert(!skip_block); + + // Process first 8 values which include a dc component. + { + // Only the first element of each vector is DC. + const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); + const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + const int16x8_t quant = vld1q_s16(quant_ptr); + const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + const int16x8_t dequant = vld1q_s16(dequant_ptr); + // Add one because the eob does not index from 0. 
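Both quantizers also lean on the fixed-point identity flagged in the shift comments: vqdmulhq_s16 computes a saturating (a * b * 2) >> 16 per lane, so one extra right shift recovers the C reference's multiply-high. A scalar sketch, assuming no saturation (the inputs here are non-negative, so the only saturating pair, -32768 * -32768, cannot occur):

#include <stdint.h>

/* Model of vqdmulhq_s16 followed by vshrq_n_s16(..., 1):
 * ((a * b * 2) >> 16) >> 1 == (a * b) >> 16. */
static int16_t mulhi(int16_t a, int16_t b) {
  const int32_t doubled = (int32_t)a * b * 2; /* what vqdmulh returns */
  return (int16_t)((doubled >> 16) >> 1);
}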
+ const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + int16x8_t dqcoeff; + int32x4_t dqcoeff_0, dqcoeff_1; + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + dqcoeff_ptr += 8; + } + + { + const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1); + const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1); + const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); + const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); + const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + + for (i = 1; i < 32 * 32 / 8; ++i) { + // Add one because the eob is not its index. + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + int16x8_t dqcoeff; + int32x4_t dqcoeff_0, dqcoeff_1; + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. 
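The extract_sign_bit additions a few lines up exist because the C reference computes dqcoeff with integer division, which truncates toward zero, while an arithmetic right shift floors. Adding the sign bit first makes the two agree for negative products. A scalar sketch:

#include <stdint.h>

/* Matches extract_sign_bit + vshrn_n_s32(..., 1): add 1 to negative values
 * so the arithmetic shift rounds toward zero like C's x / 2.
 * e.g. x = -5: (-5 + 1) >> 1 == -2 == -5 / 2; x = 5: 5 >> 1 == 2. */
static int16_t halve_toward_zero(int32_t x) {
  const int32_t sign = (int32_t)((uint32_t)x >> 31); /* 1 iff x < 0 */
  return (int16_t)((x + sign) >> 1);
}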
+ eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + dqcoeff_ptr += 8; + } + } + + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +} diff --git a/libvpx/vpx_dsp/arm/sad4d_neon.c b/libvpx/vpx_dsp/arm/sad4d_neon.c index dc2039800..b04de3aff 100644 --- a/libvpx/vpx_dsp/arm/sad4d_neon.c +++ b/libvpx/vpx_dsp/arm/sad4d_neon.c @@ -13,212 +13,230 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = - vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = - vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); - const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} - -// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16, -// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo -// and vec_sum_ref_hi. -static void sad_neon_64(const uint8x16_t vec_src_00, - const uint8x16_t vec_src_16, - const uint8x16_t vec_src_32, - const uint8x16_t vec_src_48, const uint8_t *ref, - uint16x8_t *vec_sum_ref_lo, - uint16x8_t *vec_sum_ref_hi) { - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); - const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); - - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32), - vget_low_u8(vec_ref_32)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32), - vget_high_u8(vec_ref_32)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48), - vget_low_u8(vec_ref_48)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48), - vget_high_u8(vec_ref_48)); -} - -// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, -// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi. 
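The rewrite below drops these hand-rolled reductions and per-width loops in favor of shared helpers from vpx_dsp/arm/sum_neon.h. Judging from the call sites (vget_lane_u32(horizontal_add_uint16x8(abs), 0)), that helper presumably mirrors the horizontal_long_add_16x8 being deleted here, along these lines:

#include <arm_neon.h>

/* Sketch of horizontal_add_uint16x8 as used below: pairwise-widen twice,
 * then fold the two 64-bit halves so the total lands in lane 0, matching
 * the vget_lane_u32(..., 0) at every call site. */
static inline uint32x2_t horizontal_add_uint16x8_sketch(const uint16x8_t a) {
  const uint32x4_t b = vpaddlq_u16(a);
  const uint64x2_t c = vpaddlq_u32(b);
  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
                  vreinterpret_u32_u64(vget_high_u64(c)));
}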
-static void sad_neon_32(const uint8x16_t vec_src_00, - const uint8x16_t vec_src_16, const uint8_t *ref, - uint16x8_t *vec_sum_ref_lo, - uint16x8_t *vec_sum_ref_hi) { - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); +void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + const uint8x16_t src_u8 = load_unaligned_u8q(src, src_stride); + for (i = 0; i < 4; ++i) { + const uint8x16_t ref_u8 = load_unaligned_u8q(ref[i], ref_stride); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); + res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); + } } -void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, +void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + const uint8x16_t src_0 = load_unaligned_u8q(src, src_stride); + const uint8x16_t src_1 = load_unaligned_u8q(src + 4 * src_stride, src_stride); + for (i = 0; i < 4; ++i) { + const uint8x16_t ref_0 = load_unaligned_u8q(ref[i], ref_stride); + const uint8x16_t ref_1 = + load_unaligned_u8q(ref[i] + 4 * ref_stride, ref_stride); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_0), vget_low_u8(ref_0)); + abs = vabal_u8(abs, vget_high_u8(src_0), vget_high_u8(ref_0)); + abs = vabal_u8(abs, vget_low_u8(src_1), vget_low_u8(ref_1)); + abs = vabal_u8(abs, vget_high_u8(src_1), vget_high_u8(ref_1)); + res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); + } +} + +static INLINE void sad8x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { + int i, j; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(a); + a += a_stride; + for (j = 0; j < 4; ++j) { + const uint8x8_t b_u8 = vld1_u8(b_loop[j]); + b_loop[j] += b_stride; + sum[j] = vabal_u8(sum[j], a_u8, b_u8); + } + } + + for (j = 0; j < 4; ++j) { + result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + } +} + +void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad8x_4d(src, src_stride, ref, ref_stride, res, 4); +} + +void vpx_sad8x8x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad8x_4d(src, src_stride, ref, ref_stride, res, 8); +} + +void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad8x_4d(src, src_stride, ref, ref_stride, res, 16); +} + +static INLINE void sad16x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { + int i, j; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + + for (i = 0; i < 
height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(a); + a += a_stride; + for (j = 0; j < 4; ++j) { + const uint8x16_t b_u8 = vld1q_u8(b_loop[j]); + b_loop[j] += b_stride; + sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8)); + sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8)); + } + } + + for (j = 0; j < 4; ++j) { + result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + } +} + +void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad16x_4d(src, src_stride, ref, ref_stride, res, 8); +} + +void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - - for (i = 0; i < 64; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_src_32 = vld1q_u8(src + 32); - const uint8x16_t vec_src_48 = vld1q_u8(src + 48); - - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0, - &vec_sum_ref0_lo, &vec_sum_ref0_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1, - &vec_sum_ref1_lo, &vec_sum_ref1_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2, - &vec_sum_ref2_lo, &vec_sum_ref2_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3, - &vec_sum_ref3_lo, &vec_sum_ref3_hi); - - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; + sad16x_4d(src, src_stride, ref, ref_stride, res, 16); +} + +void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad16x_4d(src, src_stride, ref, ref_stride, res, 32); +} + +static INLINE void sad32x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { + int i, j; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + a += a_stride; + for (j = 0; j < 4; ++j) { + const uint8x16_t b_0 = vld1q_u8(b_loop[j]); + const uint8x16_t b_1 = vld1q_u8(b_loop[j] + 16); + b_loop[j] += b_stride; + sum[j] = vabal_u8(sum[j], vget_low_u8(a_0), vget_low_u8(b_0)); + sum[j] = vabal_u8(sum[j], vget_high_u8(a_0), vget_high_u8(b_0)); + sum[j] = vabal_u8(sum[j], vget_low_u8(a_1), vget_low_u8(b_1)); + sum[j] = vabal_u8(sum[j], vget_high_u8(a_1), vget_high_u8(b_1)); + } } - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); + for (j = 0; j < 4; ++j) { + result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + } +} + +void vpx_sad32x16x4d_neon(const uint8_t *src, int 
src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src, src_stride, ref, ref_stride, res, 16); } void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { + sad32x_4d(src, src_stride, ref, ref_stride, res, 32); +} + +void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src, src_stride, ref, ref_stride, res, 64); +} + +static INLINE void sum64x(const uint8x16_t a_0, const uint8x16_t a_1, + const uint8x16_t b_0, const uint8x16_t b_1, + uint16x8_t *sum) { + *sum = vabal_u8(*sum, vget_low_u8(a_0), vget_low_u8(b_0)); + *sum = vabal_u8(*sum, vget_high_u8(a_0), vget_high_u8(b_0)); + *sum = vabal_u8(*sum, vget_low_u8(a_1), vget_low_u8(b_1)); + *sum = vabal_u8(*sum, vget_high_u8(a_1), vget_high_u8(b_1)); +} + +static INLINE void sad64x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - - for (i = 0; i < 32; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - - sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo, - &vec_sum_ref0_hi); - sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo, - &vec_sum_ref1_hi); - sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo, - &vec_sum_ref2_hi); - sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo, - &vec_sum_ref3_hi); - - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; + uint16x8_t sum_0 = vdupq_n_u16(0); + uint16x8_t sum_1 = vdupq_n_u16(0); + uint16x8_t sum_2 = vdupq_n_u16(0); + uint16x8_t sum_3 = vdupq_n_u16(0); + uint16x8_t sum_4 = vdupq_n_u16(0); + uint16x8_t sum_5 = vdupq_n_u16(0); + uint16x8_t sum_6 = vdupq_n_u16(0); + uint16x8_t sum_7 = vdupq_n_u16(0); + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + const uint8x16_t a_2 = vld1q_u8(a + 32); + const uint8x16_t a_3 = vld1q_u8(a + 48); + a += a_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[0]), vld1q_u8(b_loop[0] + 16), &sum_0); + sum64x(a_2, a_3, vld1q_u8(b_loop[0] + 32), vld1q_u8(b_loop[0] + 48), + &sum_1); + b_loop[0] += b_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[1]), vld1q_u8(b_loop[1] + 16), &sum_2); + sum64x(a_2, a_3, vld1q_u8(b_loop[1] + 32), vld1q_u8(b_loop[1] + 48), + &sum_3); + b_loop[1] += b_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[2]), vld1q_u8(b_loop[2] + 16), &sum_4); + sum64x(a_2, a_3, vld1q_u8(b_loop[2] + 32), vld1q_u8(b_loop[2] + 48), + &sum_5); + b_loop[2] += b_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[3]), vld1q_u8(b_loop[3] + 16), &sum_6); + sum64x(a_2, a_3, vld1q_u8(b_loop[3] + 32), vld1q_u8(b_loop[3] + 48), + &sum_7); + b_loop[3] += b_stride; } - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, 
vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); + result[0] = vget_lane_u32(horizontal_add_long_uint16x8(sum_0, sum_1), 0); + result[1] = vget_lane_u32(horizontal_add_long_uint16x8(sum_2, sum_3), 0); + result[2] = vget_lane_u32(horizontal_add_long_uint16x8(sum_4, sum_5), 0); + result[3] = vget_lane_u32(horizontal_add_long_uint16x8(sum_6, sum_7), 0); } -void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, +void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - - for (i = 0; i < 16; ++i) { - const uint8x16_t vec_src = vld1q_u8(src); - const uint8x16_t vec_ref0 = vld1q_u8(ref0); - const uint8x16_t vec_ref1 = vld1q_u8(ref1); - const uint8x16_t vec_ref2 = vld1q_u8(ref2); - const uint8x16_t vec_ref3 = vld1q_u8(ref3); - - vec_sum_ref0_lo = - vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0)); - vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref0)); - vec_sum_ref1_lo = - vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1)); - vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref1)); - vec_sum_ref2_lo = - vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2)); - vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref2)); - vec_sum_ref3_lo = - vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3)); - vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref3)); - - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; - } + sad64x_4d(src, src_stride, ref, ref_stride, res, 32); +} - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad64x_4d(src, src_stride, ref, ref_stride, res, 64); } diff --git a/libvpx/vpx_dsp/arm/sad_neon.c b/libvpx/vpx_dsp/arm/sad_neon.c index ff3228768..9518a166b 100644 --- a/libvpx/vpx_dsp/arm/sad_neon.c +++ b/libvpx/vpx_dsp/arm/sad_neon.c @@ -13,211 +13,332 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; +uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 
= load_unaligned_u8q(ref_ptr, ref_stride); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); +} + +uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); + const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); +} + +uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { int i; + uint16x8_t abs = vdupq_n_u16(0); + for (i = 0; i < 8; i += 4) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); + } - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); +} - for (i = 0; i < 15; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); +uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + int i; + uint16x8_t abs = vdupq_n_u16(0); + for (i = 0; i < 8; i += 4) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); + const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + second_pred += 16; + abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(avg)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); } - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); +} + +static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, const int height) { + int i; + uint16x8_t abs = vdupq_n_u16(0); - return vget_lane_u32(d5, 0); + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(a); + const uint8x8_t b_u8 = vld1_u8(b); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, a_u8, b_u8); + } + return abs; } -unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x2_t d1; - uint64x1_t d3; +static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { int i; + uint16x8_t abs = vdupq_n_u16(0); - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(a); + 
const uint8x8_t b_u8 = vld1_u8(b); + const uint8x8_t c_u8 = vld1_u8(c); + const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); + a += a_stride; + b += b_stride; + c += 8; + abs = vabal_u8(abs, a_u8, avg); + } + return abs; +} - for (i = 0; i < 3; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); +#define sad8xN(n) \ + uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } - d1 = vpaddl_u16(vget_low_u16(q12)); - d3 = vpaddl_u32(d1); +sad8xN(4); +sad8xN(8); +sad8xN(16); + +static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { + int i; + uint16x8_t abs = vdupq_n_u16(0); - return vget_lane_u32(vreinterpret_u32_u64(d3), 0); + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(a); + const uint8x16_t b_u8 = vld1q_u8(b); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8)); + abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8)); + } + return abs; } -unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x16_t q0, q4; - uint16x8_t q12, q13; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; +static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { int i; + uint16x8_t abs = vdupq_n_u16(0); - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); - - for (i = 0; i < 7; i++) { - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); - } - - q12 = vaddq_u16(q12, q13); - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(a); + const uint8x16_t b_u8 = vld1q_u8(b); + const uint8x16_t c_u8 = vld1q_u8(c); + const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); + a += a_stride; + b += b_stride; + c += 16; + abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg)); + abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg)); + } + return abs; } -static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = - vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = - vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); - const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} -static INLINE 
unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { - const uint32x4_t a = vpaddlq_u16(vec_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} +#define sad16xN(n) \ + uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } + +sad16xN(8); +sad16xN(16); +sad16xN(32); -unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - for (i = 0; i < 64; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_src_32 = vld1q_u8(src + 32); - const uint8x16_t vec_src_48 = vld1q_u8(src + 48); - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); - const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); - src += src_stride; - ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), - vget_low_u8(vec_ref_32)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), - vget_high_u8(vec_ref_32)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), - vget_low_u8(vec_ref_48)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), - vget_high_u8(vec_ref_48)); - } - return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); + uint16x8_t abs = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(a); + const uint8x16_t a_hi = vld1q_u8(a + 16); + const uint8x16_t b_lo = vld1q_u8(b); + const uint8x16_t b_hi = vld1q_u8(b + 16); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo)); + abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo)); + abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi)); + abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi)); + } + return abs; } -unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - - for (i = 0; i < 32; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - 
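One design point the new kernels leave implicit: a single uint16x8_t accumulator is only safe while the worst-case sum fits a 16-bit lane. Back-of-the-envelope arithmetic (mine, not from the patch): each vabal_u8 adds one |a - b| <= 255 into every lane, so a 32-byte row contributes at most 4 * 255 per lane and 64 rows stay just under 65535, whereas a 64-byte row would overflow, which is presumably why sad64x above splits its sums into abs_0/abs_1.

/* Compile-time check of that headroom argument (C11). */
_Static_assert(4 * 255 * 64 <= 65535, "32 bytes/row over 64 rows fits one uint16 lane");
_Static_assert(8 * 255 * 64 > 65535, "64 bytes/row over 64 rows would overflow a lane");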
const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - src += src_stride; - ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - } - return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); + uint16x8_t abs = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(a); + const uint8x16_t a_hi = vld1q_u8(a + 16); + const uint8x16_t b_lo = vld1q_u8(b); + const uint8x16_t b_hi = vld1q_u8(b + 16); + const uint8x16_t c_lo = vld1q_u8(c); + const uint8x16_t c_hi = vld1q_u8(c + 16); + const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); + const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); + a += a_stride; + b += b_stride; + c += 32; + abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo)); + abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo)); + abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi)); + abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(avg_hi)); + } + return abs; } -unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +#define sad32xN(n) \ + uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } + +sad32xN(16); +sad32xN(32); +sad32xN(64); + +static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - - for (i = 0; i < 16; ++i) { - const uint8x16_t vec_src = vld1q_u8(src); - const uint8x16_t vec_ref = vld1q_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum_lo = - vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref)); - vec_accum_hi = - vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref)); - } - return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); + uint16x8_t abs_0 = vdupq_n_u16(0); + uint16x8_t abs_1 = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + const uint8x16_t a_2 = vld1q_u8(a + 32); + const uint8x16_t a_3 = vld1q_u8(a + 48); + const uint8x16_t b_0 = vld1q_u8(b); + const uint8x16_t b_1 = vld1q_u8(b + 16); + const uint8x16_t b_2 = vld1q_u8(b + 32); + const uint8x16_t b_3 = vld1q_u8(b + 48); + a += a_stride; + b += b_stride; + abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0)); + abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1)); + abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2)); + 
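  // Assumed rationale for the two accumulators: each vabal_u8 adds at most
  // 255 to every uint16 lane, and each accumulator takes four such calls per
  // row. Over the 64 rows of a 64x64 block that is at most
  // 4 * 255 * 64 = 65280 < 65535, so abs_0 and abs_1 stay within 16 bits
  // until the widening add at the end; one accumulator for all eight calls
  // per row would overflow.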
abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3)); + } + + { + const uint32x4_t sum = vpaddlq_u16(abs_0); + return vpadalq_u16(sum, abs_1); + } } -unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { int i; - uint16x8_t vec_accum = vdupq_n_u16(0); + uint16x8_t abs_0 = vdupq_n_u16(0); + uint16x8_t abs_1 = vdupq_n_u16(0); - for (i = 0; i < 8; ++i) { - const uint8x8_t vec_src = vld1_u8(src); - const uint8x8_t vec_ref = vld1_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + const uint8x16_t a_2 = vld1q_u8(a + 32); + const uint8x16_t a_3 = vld1q_u8(a + 48); + const uint8x16_t b_0 = vld1q_u8(b); + const uint8x16_t b_1 = vld1q_u8(b + 16); + const uint8x16_t b_2 = vld1q_u8(b + 32); + const uint8x16_t b_3 = vld1q_u8(b + 48); + const uint8x16_t c_0 = vld1q_u8(c); + const uint8x16_t c_1 = vld1q_u8(c + 16); + const uint8x16_t c_2 = vld1q_u8(c + 32); + const uint8x16_t c_3 = vld1q_u8(c + 48); + const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); + const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); + const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); + const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); + a += a_stride; + b += b_stride; + c += 64; + abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0)); + abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(avg_1)); + abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(avg_2)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(avg_2)); + abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(avg_3)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(avg_3)); + } + + { + const uint32x4_t sum = vpaddlq_u16(abs_0); + return vpadalq_u16(sum, abs_1); } - return horizontal_add_16x8(vec_accum); } + +#define sad64xN(n) \ + uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + } \ + \ + uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t abs = \ + sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + } + +sad64xN(32); +sad64xN(64); diff --git a/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/libvpx/vpx_dsp/arm/subpel_variance_neon.c index 9b1622ff0..4f58a7832 100644 --- a/libvpx/vpx_dsp/arm/subpel_variance_neon.c +++ b/libvpx/vpx_dsp/arm/subpel_variance_neon.c @@ -12,16 +12,39 @@ #include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" -#include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/variance.h" +#include "vpx_dsp/arm/mem_neon.h" static const uint8_t bilinear_filters[8][2] = { { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; +// Process a block exactly 4 wide and a multiple of 2 high. 
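The helper that follows pairs rows up with load_unaligned_u8 so each NEON iteration filters eight pixels (two 4-wide rows), which is why the loop steps by 2 and advances output_ptr by 8. As a scalar reference, each output pixel is the two-tap rounding blend below, assuming FILTER_BITS == 7 (each bilinear_filters row sums to 128):

  static uint8_t bilinear_tap(uint8_t s0, uint8_t s1, const uint8_t f[2]) {
    /* Matches vmull_u8 + vmlal_u8 + vrshrn_n_u16: widen, blend, round,
     * narrow. (1 << 6) is the rounding bias for a shift by 7. */
    return (uint8_t)((s0 * f[0] + s1 * f[1] + (1 << 6)) >> 7);
  }
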
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + const uint8_t *filter) { + const uint8x8_t f0 = vdup_n_u8(filter[0]); + const uint8x8_t f1 = vdup_n_u8(filter[1]); + unsigned int i; + for (i = 0; i < output_height; i += 2) { + const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line); + const uint8x8_t src_1 = + load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line); + const uint16x8_t a = vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(output_ptr, out); + src_ptr += 2 * src_pixels_per_line; + output_ptr += 8; + } +} + // Process a block exactly 8 wide and any height. static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *output_ptr, @@ -29,8 +52,8 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, int pixel_step, unsigned int output_height, const uint8_t *filter) { - const uint8x8_t f0 = vmov_n_u8(filter[0]); - const uint8x8_t f1 = vmov_n_u8(filter[1]); + const uint8x8_t f0 = vdup_n_u8(filter[0]); + const uint8x8_t f1 = vdup_n_u8(filter[1]); unsigned int i; for (i = 0; i < output_height; ++i) { const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); @@ -38,8 +61,7 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, const uint16x8_t a = vmull_u8(src_0, f0); const uint16x8_t b = vmlal_u8(a, src_1, f1); const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(&output_ptr[0], out); - // Next row... + vst1_u8(output_ptr, out); src_ptr += src_pixels_per_line; output_ptr += 8; } @@ -53,8 +75,8 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { - const uint8x8_t f0 = vmov_n_u8(filter[0]); - const uint8x8_t f1 = vmov_n_u8(filter[1]); + const uint8x8_t f0 = vdup_n_u8(filter[0]); + const uint8x8_t f1 = vdup_n_u8(filter[1]); unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 16) { @@ -66,36 +88,43 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); - vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); + vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi)); } - // Next row... src_ptr += src_pixels_per_line; output_ptr += output_width; } } -// TODO(johannkoenig): support 4xM block sizes. -#define sub_pixel_varianceNxM(n, m) \ - unsigned int vpx_sub_pixel_variance##n##x##m##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse) { \ - DECLARE_ALIGNED(16, uint8_t, fdata3[n * (m + 1)]); \ - DECLARE_ALIGNED(16, uint8_t, temp2[n * m]); \ - \ - if (n == 8) { \ - var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, (m + 1), \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w8(fdata3, temp2, n, n, m, \ - bilinear_filters[yoffset]); \ - } else { \ - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, (m + 1), n, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w16(fdata3, temp2, n, n, m, n, \ - bilinear_filters[yoffset]); \ - } \ - return vpx_variance##n##x##m(temp2, n, dst, dst_stride, sse); \ +// 4xM filter writes an extra row to fdata because it processes two rows at a +// time. 
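For reference, the dispatch macro below expands per block size at compile time; for (n, m) = (4, 4) the generated function is effectively the following (a sketch with comments added, not the literal preprocessor output):

  uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *a, int a_stride,
                                          int xoffset, int yoffset,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
    uint8_t temp0[4 * 6]; /* m + 2 rows: the w4 filter emits rows in pairs. */
    uint8_t temp1[4 * 4];
    /* Horizontal pass (pixel_step 1), then vertical pass (pixel_step n). */
    var_filter_block2d_bil_w4(a, temp0, a_stride, 1, 6,
                              bilinear_filters[xoffset]);
    var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 4,
                              bilinear_filters[yoffset]);
    return vpx_variance4x4(temp1, 4, b, b_stride, sse);
  }
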
+#define sub_pixel_varianceNxM(n, m) \ + uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else { \ + var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ + bilinear_filters[yoffset]); \ + } \ + return vpx_variance##n##x##m(temp1, n, b, b_stride, sse); \ } +sub_pixel_varianceNxM(4, 4); +sub_pixel_varianceNxM(4, 8); sub_pixel_varianceNxM(8, 4); sub_pixel_varianceNxM(8, 8); sub_pixel_varianceNxM(8, 16); @@ -107,3 +136,49 @@ sub_pixel_varianceNxM(32, 32); sub_pixel_varianceNxM(32, 64); sub_pixel_varianceNxM(64, 32); sub_pixel_varianceNxM(64, 64); + +// 4xM filter writes an extra row to fdata because it processes two rows at a +// time. +#define sub_pixel_avg_varianceNxM(n, m) \ + uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else { \ + var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ + bilinear_filters[yoffset]); \ + } \ + \ + vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ + \ + return vpx_variance##n##x##m(temp0, n, b, b_stride, sse); \ + } + +sub_pixel_avg_varianceNxM(4, 4); +sub_pixel_avg_varianceNxM(4, 8); +sub_pixel_avg_varianceNxM(8, 4); +sub_pixel_avg_varianceNxM(8, 8); +sub_pixel_avg_varianceNxM(8, 16); +sub_pixel_avg_varianceNxM(16, 8); +sub_pixel_avg_varianceNxM(16, 16); +sub_pixel_avg_varianceNxM(16, 32); +sub_pixel_avg_varianceNxM(32, 16); +sub_pixel_avg_varianceNxM(32, 32); +sub_pixel_avg_varianceNxM(32, 64); +sub_pixel_avg_varianceNxM(64, 32); +sub_pixel_avg_varianceNxM(64, 64); diff --git a/libvpx/vpx_dsp/arm/sum_neon.h b/libvpx/vpx_dsp/arm/sum_neon.h new file mode 100644 index 000000000..d74fe0cde --- /dev/null +++ b/libvpx/vpx_dsp/arm/sum_neon.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_ARM_SUM_NEON_H_ +#define VPX_DSP_ARM_SUM_NEON_H_ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +static INLINE int32x2_t horizontal_add_int16x8(const int16x8_t a) { + const int32x4_t b = vpaddlq_s16(a); + const int64x2_t c = vpaddlq_s32(b); + return vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)), + vreinterpret_s32_s64(vget_high_s64(c))); +} + +static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) { + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); +} + +static INLINE uint32x2_t horizontal_add_long_uint16x8(const uint16x8_t a, + const uint16x8_t b) { + const uint32x4_t c = vpaddlq_u16(a); + const uint32x4_t d = vpadalq_u16(c, b); + const uint64x2_t e = vpaddlq_u32(d); + return vadd_u32(vreinterpret_u32_u64(vget_low_u64(e)), + vreinterpret_u32_u64(vget_high_u64(e))); +} + +static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { + const uint64x2_t b = vpaddlq_u32(a); + return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); +} +#endif // VPX_DSP_ARM_SUM_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/variance_neon.c b/libvpx/vpx_dsp/arm/variance_neon.c index a6b2c53b7..61c2c16a7 100644 --- a/libvpx/vpx_dsp/arm/variance_neon.c +++ b/libvpx/vpx_dsp/arm/variance_neon.c @@ -16,23 +16,9 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { - const int32x4_t a = vpaddlq_s16(v_16x8); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { - const int64x2_t b = vpaddlq_s32(v_32x4); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - // The variance helper functions use int16_t for sum. 8 values are accumulated // and then added (at which point they expand up to int32_t). To avoid overflow, // there can be no more than 32767 / 255 ~= 128 values accumulated in each @@ -79,8 +65,10 @@ static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b, b += 4 * b_stride; } - *sum = horizontal_add_s16x8(sum_s16); - *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32)); + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); } // Process a block of any size where the width is divisible by 16. @@ -126,8 +114,10 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, b += b_stride; } - *sum = horizontal_add_s16x8(sum_s16); - *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32)); + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); } // Process a block of width 8 two rows at a time. 
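The sum_neon.h helpers above replace the per-file reductions; each folds a whole vector into lane 0 of the returned pair, so callers finish with vget_lane_u32(..., 0). As a scalar cross-check, horizontal_add_uint16x8 computes:

  static uint32_t horizontal_add_u16x8_ref(const uint16_t v[8]) {
    /* vpaddlq_u16 -> vpaddlq_u32 -> vadd_u32, flattened to one loop. */
    uint32_t sum = 0;
    int i;
    for (i = 0; i < 8; ++i) sum += v[i];
    return sum; /* lane 0 of the returned uint32x2_t; lane 1 is zero. */
  }
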
@@ -165,8 +155,10 @@ static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, i += 2; } while (i < h); - *sum = horizontal_add_s16x8(sum_s16); - *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32)); + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); } void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm b/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm index e279d570f..1c2ee5063 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm +++ b/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm @@ -42,10 +42,11 @@ ; r1 int src_stride ; r2 uint8_t *dst ; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused +; sp[]const int16_t *filter +; sp[]int x0_q4 +; sp[]int x_step_q4 ; unused +; sp[]int y0_q4 +; sp[]int y_step_q4 ; unused ; sp[]int w ; sp[]int h @@ -54,11 +55,11 @@ sub r0, r0, #3 ; adjust for taps - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h + ldrd r4, r5, [sp, #32] ; filter, x0_q4 + add r4, r5, lsl #4 + ldrd r6, r7, [sp, #52] ; w, h - vld1.s16 {q0}, [r5] ; filter_x + vld1.s16 {q0}, [r4] ; filter sub r8, r1, r1, lsl #2 ; -src_stride * 3 add r8, r8, #4 ; -src_stride * 3 + 4 @@ -127,7 +128,7 @@ vpx_convolve8_avg_loop_horiz sub r2, r2, r3, lsl #2 ; reset for store - ; src[] * filter_x + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 @@ -184,11 +185,13 @@ vpx_convolve8_avg_loop_horiz sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h + ldr r4, [sp, #24] ; filter + ldr r5, [sp, #36] ; y0_q4 + add r4, r5, lsl #4 + ldr r6, [sp, #44] ; w + ldr lr, [sp, #48] ; h - vld1.s16 {q0}, [r4] ; filter_y + vld1.s16 {q0}, [r4] ; filter lsl r1, r1, #1 lsl r3, r3, #1 @@ -232,7 +235,7 @@ vpx_convolve8_avg_loop_vert pld [r7] pld [r4] - ; src[] * filter_y + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 pld [r7, r1] diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c index 1386838ee..08ae17dba 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_ports/mem.h" // Note: @@ -29,43 +30,11 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. 
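The convolve entry points in this file also move to the kernel-table prototype: InterpKernel is libvpx's 8-tap row type (typedef int16_t InterpKernel[8] in vpx_dsp/vpx_filter.h), and x0_q4/y0_q4 index the 16-phase table, which is why the taps now load as vld1q_s16(filter[x0_q4]). A minimal sketch of the selection:

  typedef int16_t InterpKernel[8]; /* mirrors the libvpx typedef */

  /* vld1q_s16(filter[x0_q4]) reads the 8 taps of one phase: */
  static const int16_t *select_phase(const InterpKernel *filter, int q4) {
    return filter[q4]; /* row q4 (0..15) of the kernel table */
  }
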
-static INLINE void load_8x4(const uint8_t *s, ptrdiff_t p, uint8x8_t *s0, - uint8x8_t *s1, uint8x8_t *s2, uint8x8_t *s3) { - *s0 = vld1_u8(s); - s += p; - *s1 = vld1_u8(s); - s += p; - *s2 = vld1_u8(s); - s += p; - *s3 = vld1_u8(s); -} - -static INLINE void load_8x8(const uint8_t *s, ptrdiff_t p, uint8x8_t *s0, - uint8x8_t *s1, uint8x8_t *s2, uint8x8_t *s3, - uint8x8_t *s4, uint8x8_t *s5, uint8x8_t *s6, - uint8x8_t *s7) { - *s0 = vld1_u8(s); - s += p; - *s1 = vld1_u8(s); - s += p; - *s2 = vld1_u8(s); - s += p; - *s3 = vld1_u8(s); - s += p; - *s4 = vld1_u8(s); - s += p; - *s5 = vld1_u8(s); - s += p; - *s6 = vld1_u8(s); - s += p; - *s7 = vld1_u8(s); -} - -static INLINE void store_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, - const uint8x8_t s1, const uint8x8_t s2, - const uint8x8_t s3, const uint8x8_t s4, - const uint8x8_t s5, const uint8x8_t s6, - const uint8x8_t s7) { +static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7) { vst1_u8(s, s0); s += p; vst1_u8(s, s1); @@ -83,53 +52,12 @@ static INLINE void store_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, vst1_u8(s, s7); } -static INLINE int16x4_t convolve8_4(int16x4_t s0, int16x4_t s1, int16x4_t s2, - int16x4_t s3, int16x4_t s4, int16x4_t s5, - int16x4_t s6, int16x4_t s7, - int16x8_t filters, int16x4_t filter3, - int16x4_t filter4) { - const int16x4_t filters_lo = vget_low_s16(filters); - const int16x4_t filters_hi = vget_high_s16(filters); - int16x4_t sum = vdup_n_s16(0); - - sum = vmla_lane_s16(sum, s0, filters_lo, 0); - sum = vmla_lane_s16(sum, s1, filters_lo, 1); - sum = vmla_lane_s16(sum, s2, filters_lo, 2); - sum = vmla_lane_s16(sum, s5, filters_hi, 1); - sum = vmla_lane_s16(sum, s6, filters_hi, 2); - sum = vmla_lane_s16(sum, s7, filters_hi, 3); - sum = vqadd_s16(sum, vmul_s16(s3, filter3)); - sum = vqadd_s16(sum, vmul_s16(s4, filter4)); - return sum; -} - -static INLINE int16x8_t convolve8_8(int16x8_t s0, int16x8_t s1, int16x8_t s2, - int16x8_t s3, int16x8_t s4, int16x8_t s5, - int16x8_t s6, int16x8_t s7, - int16x8_t filters, int16x8_t filter3, - int16x8_t filter4) { - const int16x4_t filters_lo = vget_low_s16(filters); - const int16x4_t filters_hi = vget_high_s16(filters); - int16x8_t sum = vdupq_n_s16(0); - - sum = vmlaq_lane_s16(sum, s0, filters_lo, 0); - sum = vmlaq_lane_s16(sum, s1, filters_lo, 1); - sum = vmlaq_lane_s16(sum, s2, filters_lo, 2); - sum = vmlaq_lane_s16(sum, s5, filters_hi, 1); - sum = vmlaq_lane_s16(sum, s6, filters_hi, 2); - sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); - sum = vqaddq_s16(sum, vmulq_s16(s3, filter3)); - sum = vqaddq_s16(sum, vmulq_s16(s4, filter4)); - return sum; -} - void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h) { - const int16x8_t filters = vld1q_s16(filter_x); + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16x8_t filters = vld1q_s16(filter[x0_q4]); uint8x8_t t0, t1, t2, t3; assert(!((intptr_t)dst & 3)); @@ -137,8 +65,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 == 16); (void)x_step_q4; + (void)y0_q4; (void)y_step_q4; - (void)filter_y; src -= 3; @@ -154,7 +82,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, 
ptrdiff_t src_stride, __builtin_prefetch(src + 3 * src_stride); filter3 = vdup_lane_s16(vget_low_s16(filters), 3); filter4 = vdup_lane_s16(vget_high_s16(filters), 0); - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -174,7 +102,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 7; do { - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -224,11 +152,11 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int width; const uint8_t *s; uint8x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; if (w == 4) { do { - load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -238,7 +166,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - load_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); src += 8 * src_stride; __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); @@ -248,7 +177,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(dst + 5 * dst_stride); __builtin_prefetch(dst + 6 * dst_stride); __builtin_prefetch(dst + 7 * dst_stride); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); @@ -262,19 +191,15 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, filter4); - t0 = vqrshrun_n_s16(d0, 7); - t1 = vqrshrun_n_s16(d1, 7); - t2 = vqrshrun_n_s16(d2, 7); - t3 = vqrshrun_n_s16(d3, 7); transpose_u8_8x4(&t0, &t1, &t2, &t3); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0); dst += dst_stride; @@ -296,7 +221,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } while (h > 0); } else { uint8_t *d; - int16x8_t s11, s12, s13, s14, d4, d5, d6, d7; + int16x8_t s11, s12, s13, s14; do { __builtin_prefetch(src + 0 * src_stride); @@ -307,7 +232,7 @@ void 
vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -330,7 +255,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(dst + 7 * dst_stride); do { - load_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -341,33 +266,25 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, filter4); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, filter4); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, filter4); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, filter4); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, filter3, filter4); - t0 = vqrshrun_n_s16(d0, 7); - t1 = vqrshrun_n_s16(d1, 7); - t2 = vqrshrun_n_s16(d2, 7); - t3 = vqrshrun_n_s16(d3, 7); - t4 = vqrshrun_n_s16(d4, 7); - t5 = vqrshrun_n_s16(d5, 7); - t6 = vqrshrun_n_s16(d6, 7); - t7 = vqrshrun_n_s16(d7, 7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - store_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); + store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); s0 = s8; s1 = s9; @@ -390,11 +307,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int16x8_t filters = vld1q_s16(filter_x); + const int16x8_t filters = vld1q_s16(filter[x0_q4]); uint8x8_t t0, t1, t2, t3; assert(!((intptr_t)dst & 3)); @@ -402,8 +318,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 == 16); (void)x_step_q4; + (void)y0_q4; (void)y_step_q4; - (void)filter_y; src -= 3; @@ -420,7 +336,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t 
*src, ptrdiff_t src_stride, __builtin_prefetch(src + 3 * src_stride); filter3 = vdup_lane_s16(vget_low_s16(filters), 3); filter4 = vdup_lane_s16(vget_high_s16(filters), 0); - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -440,7 +356,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 7; do { - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -493,13 +409,13 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int width; const uint8_t *s; uint8x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; if (w == 4) { uint32x4_t d0415 = vdupq_n_u32(0); uint32x4_t d2637 = vdupq_n_u32(0); do { - load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -509,7 +425,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - load_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); src += 8 * src_stride; __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); @@ -519,7 +436,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(dst + 5 * dst_stride); __builtin_prefetch(dst + 6 * dst_stride); __builtin_prefetch(dst + 7 * dst_stride); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); @@ -533,19 +450,15 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, filter4); - t0 = vqrshrun_n_s16(d0, 7); - t1 = vqrshrun_n_s16(d1, 7); - t2 = vqrshrun_n_s16(d2, 7); - t3 = vqrshrun_n_s16(d3, 7); transpose_u8_8x4(&t0, &t1, &t2, &t3); d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0); @@ -581,7 +494,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } while (h > 0); } else { uint8_t *d; - int16x8_t s11, s12, s13, s14, d4, d5, d6, d7; + int16x8_t s11, s12, s13, s14; 
uint8x16_t d01, d23, d45, d67; do { @@ -593,7 +506,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -616,7 +529,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(dst + 7 * dst_stride); do { - load_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -627,31 +540,23 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, filter4); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, filter4); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, filter4); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, filter4); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, filter3, filter4); - t0 = vqrshrun_n_s16(d0, 7); - t1 = vqrshrun_n_s16(d1, 7); - t2 = vqrshrun_n_s16(d2, 7); - t3 = vqrshrun_n_s16(d3, 7); - t4 = vqrshrun_n_s16(d4, 7); - t5 = vqrshrun_n_s16(d5, 7); - t6 = vqrshrun_n_s16(d6, 7); - t7 = vqrshrun_n_s16(d7, 7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), @@ -667,9 +572,9 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5)); d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7)); - store_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01), - vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45), - vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67)); + store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01), + vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45), + vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67)); s0 = s8; s1 = s9; @@ -692,19 +597,18 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t 
*filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int16x8_t filters = vld1q_s16(filter_y); + const int16x8_t filters = vld1q_s16(filter[y0_q4]); assert(!((intptr_t)dst & 3)); assert(!(dst_stride & 3)); assert(y_step_q4 == 16); + (void)x0_q4; (void)x_step_q4; (void)y_step_q4; - (void)filter_x; src -= 3 * src_stride; @@ -782,7 +686,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int height; const uint8_t *s; uint8_t *d; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + uint8x8_t t0, t1, t2, t3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; do { __builtin_prefetch(src + 0 * src_stride); @@ -828,22 +733,22 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, filter4); - vst1_u8(d, vqrshrun_n_s16(d0, 7)); + vst1_u8(d, t0); d += dst_stride; - vst1_u8(d, vqrshrun_n_s16(d1, 7)); + vst1_u8(d, t1); d += dst_stride; - vst1_u8(d, vqrshrun_n_s16(d2, 7)); + vst1_u8(d, t2); d += dst_stride; - vst1_u8(d, vqrshrun_n_s16(d3, 7)); + vst1_u8(d, t3); d += dst_stride; s0 = s4; @@ -864,19 +769,18 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int16x8_t filters = vld1q_s16(filter_y); + const int16x8_t filters = vld1q_s16(filter[y0_q4]); assert(!((intptr_t)dst & 3)); assert(!(dst_stride & 3)); assert(y_step_q4 == 16); + (void)x0_q4; (void)x_step_q4; (void)y_step_q4; - (void)filter_x; src -= 3 * src_stride; @@ -963,8 +867,9 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int height; const uint8_t *s; uint8_t *d; + uint8x8_t t0, t1, t2, t3; uint8x16_t d01, d23, dd01, dd23; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; do { __builtin_prefetch(src + 0 * src_stride); @@ -1010,17 +915,17 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, 
filters, filter3, filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, filter4); - d01 = vcombine_u8(vqrshrun_n_s16(d0, 7), vqrshrun_n_s16(d1, 7)); - d23 = vcombine_u8(vqrshrun_n_s16(d2, 7), vqrshrun_n_s16(d3, 7)); + d01 = vcombine_u8(t0, t1); + d23 = vcombine_u8(t2, t3); dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), vld1_u8(d + 1 * dst_stride)); dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h new file mode 100644 index 000000000..c1634ed55 --- /dev/null +++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); +} + +static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6, uint8x8_t *const s7) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); + s += p; + *s7 = vld1_u8(s); +} + +static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4, uint8x16_t *const s5, + uint8x16_t *const s6, uint8x16_t *const s7) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); + s += p; + *s5 = vld1q_u8(s); + s += p; + *s6 = vld1q_u8(s); + s += p; + *s7 = vld1q_u8(s); +} + +static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filters, + const int16x4_t filter3, + const int16x4_t filter4) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int16x4_t sum; + + sum = vmul_lane_s16(s0, filters_lo, 0); + sum = vmla_lane_s16(sum, s1, filters_lo, 1); + sum = vmla_lane_s16(sum, s2, filters_lo, 2); + sum = vmla_lane_s16(sum, s5, filters_hi, 1); + sum = vmla_lane_s16(sum, s6, filters_hi, 2); + sum = vmla_lane_s16(sum, s7, filters_hi, 3); + sum = vqadd_s16(sum, vmul_s16(s3, filter3)); + sum = vqadd_s16(sum, vmul_s16(s4, filter4)); + return sum; +} + +static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filters, + const int16x8_t filter3, + const int16x8_t 
filter4) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int16x8_t sum; + + sum = vmulq_lane_s16(s0, filters_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filters_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filters_lo, 2); + sum = vmlaq_lane_s16(sum, s5, filters_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filters_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); + sum = vqaddq_s16(sum, vmulq_s16(s3, filter3)); + sum = vqaddq_s16(sum, vmulq_s16(s4, filter4)); + return vqrshrun_n_s16(sum, 7); +} + +static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, + const int16x8_t filters) { + const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); + const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); + int16x8_t ss[8]; + + ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); + ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); + ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); + ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); + ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4])); + ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5])); + ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6])); + ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7])); + + return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7], + filters, filter3, filter4); +} diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm b/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm index 2d0f2ae06..5eee15664 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm +++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm @@ -42,10 +42,11 @@ ; r1 int src_stride ; r2 uint8_t *dst ; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused +; sp[]const int16_t *filter +; sp[]int x0_q4 +; sp[]int x_step_q4 ; unused +; sp[]int y0_q4 +; sp[]int y_step_q4 ; unused ; sp[]int w ; sp[]int h @@ -54,11 +55,11 @@ sub r0, r0, #3 ; adjust for taps - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h + ldrd r4, r5, [sp, #32] ; filter, x0_q4 + add r4, r5, lsl #4 + ldrd r6, r7, [sp, #52] ; w, h - vld1.s16 {q0}, [r5] ; filter_x + vld1.s16 {q0}, [r4] ; filter sub r8, r1, r1, lsl #2 ; -src_stride * 3 add r8, r8, #4 ; -src_stride * 3 + 4 @@ -119,7 +120,7 @@ vpx_convolve8_loop_horiz pld [r5, r1, lsl #1] - ; src[] * filter_x + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 @@ -173,11 +174,13 @@ vpx_convolve8_loop_horiz sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h + ldr r4, [sp, #24] ; filter + ldr r5, [sp, #36] ; y0_q4 + add r4, r5, lsl #4 + ldr r6, [sp, #44] ; w + ldr lr, [sp, #48] ; h - vld1.s16 {q0}, [r4] ; filter_y + vld1.s16 {q0}, [r4] ; filter lsl r1, r1, #1 lsl r3, r3, #1 @@ -216,7 +219,7 @@ vpx_convolve8_loop_vert pld [r5] pld [r8] - ; src[] * filter_y + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 pld [r5, r3] diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c index 04cb835fa..07349d03a 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c +++ b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c @@ -15,13 +15,13 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int 
filter_y_stride, int w, - int h) { - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; if (w < 8) { // avg4 uint8x8_t s0, s1; diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm index 97e6189fd..efd6574f1 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm +++ b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm @@ -17,7 +17,7 @@ |vpx_convolve_avg_neon| PROC push {r4-r6, lr} - ldrd r4, r5, [sp, #32] + ldrd r4, r5, [sp, #36] mov r6, r2 cmp r4, #32 diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c index a8f690acd..7abed67a4 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -15,13 +15,14 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; if (w < 8) { // copy4 do { diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm index 89164ad48..7a66e3ce2 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm +++ b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm @@ -17,7 +17,7 @@ |vpx_convolve_copy_neon| PROC push {r4-r5, lr} - ldrd r4, r5, [sp, #28] + ldrd r4, r5, [sp, #32] cmp r4, #32 bgt copy64 diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve_neon.c index 6ca0e501b..2bf2d890b 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve_neon.c +++ b/libvpx/vpx_dsp/arm/vpx_convolve_neon.c @@ -15,13 +15,13 @@ #include "vpx_ports/mem.h" void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). */ - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); + uint8_t temp[64 * 72]; // Account for the vertical phase needing 3 lines prior and 4 lines post const int intermediate_height = h + 7; @@ -33,21 +33,21 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, * height and filter a multiple of 4 lines. Since this goes in to the temp * buffer which has lots of extra room and is subsequently discarded this is * safe if somewhat less than ideal. 
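 * (Checking the headroom: intermediate_height = h + 7 <= 71 for h <= 64,
 * and rounding 71 up to a multiple of 4 gives 72, which is the row count
 * temp[64 * 72] is sized for.)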
*/ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x, - x_step_q4, filter_y, y_step_q4, w, + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height); /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); + uint8_t temp[64 * 72]; const int intermediate_height = h + 7; assert(y_step_q4 == 16); @@ -56,9 +56,9 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, /* This implementation has the same issues as above. In addition, we only want * to average the values after both passes. */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x, - x_step_q4, filter_y, y_step_q4, w, + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height); - vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } diff --git a/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c new file mode 100644 index 000000000..8edf8a66e --- /dev/null +++ b/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include <assert.h> +#include <string.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_ports/mem.h" + +static INLINE void scaledconvolve_horiz_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int x, y, z; + + src -= SUBPEL_TAPS / 2 - 1; + + y = h; + do { + int x_q4 = x0_q4; + x = 0; + do { + // process 4 src_x steps + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + uint8x8_t s[8], d; + int16x8_t ss[4]; + int16x4_t t[8], tt; + + load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]); + transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]); + + ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); + ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); + ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); + ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); + t[0] = vget_low_s16(ss[0]); + t[1] = vget_low_s16(ss[1]); + t[2] = vget_low_s16(ss[2]); + t[3] = vget_low_s16(ss[3]); + t[4] = vget_high_s16(ss[0]); + t[5] = vget_high_s16(ss[1]); + t[6] = vget_high_s16(ss[2]); + t[7] = vget_high_s16(ss[3]); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], + filters, filter3, filter4); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0); + } else { + int i; + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 4x4 filters values back to dst + { + const uint8x8x4_t d4 = vld4_u8(temp); + vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride], + vreinterpret_u32_u8(d4.val[0]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride], + vreinterpret_u32_u8(d4.val[1]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride], + vreinterpret_u32_u8(d4.val[2]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride], + vreinterpret_u32_u8(d4.val[3]), 0); + } + x += 4; + } while (x < w); + + src += src_stride * 4; + dst += dst_stride * 4; + y -= 4; + } while (y > 0); +} + +static INLINE void scaledconvolve_horiz_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. 
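  // e.g. h = 13 rounds up to (13 + 7) & ~7 = 16, keeping the 8-row tiling
  // below uniform.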
+ y = (h + 7) & ~7; + + do { + int x_q4 = x0_q4; + x = 0; + do { + uint8x8_t d[8]; + // process 8 src_x steps + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + uint8x8_t s[8]; + load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], + &s[5], &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + d[0] = scale_filter_8(s, filters); + vst1_u8(&temp[8 * z], d[0]); + } else { + int i; + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 8x8 filters values back to dst + load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + vst1_u8(&dst[x + 0 * dst_stride], d[0]); + vst1_u8(&dst[x + 1 * dst_stride], d[1]); + vst1_u8(&dst[x + 2 * dst_stride], d[2]); + vst1_u8(&dst[x + 3 * dst_stride], d[3]); + vst1_u8(&dst[x + 4 * dst_stride], d[4]); + vst1_u8(&dst[x + 5 * dst_stride], d[5]); + vst1_u8(&dst[x + 6 * dst_stride], d[6]); + vst1_u8(&dst[x + 7 * dst_stride], d[7]); + x += 8; + } while (x < w); + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static INLINE void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + uint8x8_t s[8], d; + int16x4_t t[8], tt; + + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0]))); + t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1]))); + t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2]))); + t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3]))); + t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4]))); + t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5]))); + t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); + t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters, + filter3, filter4); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + 
&s[6], &s[7]); + d = scale_filter_8(s, filters); + vst1_u8(dst, d); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int x, y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + x = 0; + do { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x16_t ss[8]; + uint8x8_t s[8], d[2]; + load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4], + &ss[5], &ss[6], &ss[7]); + s[0] = vget_low_u8(ss[0]); + s[1] = vget_low_u8(ss[1]); + s[2] = vget_low_u8(ss[2]); + s[3] = vget_low_u8(ss[3]); + s[4] = vget_low_u8(ss[4]); + s[5] = vget_low_u8(ss[5]); + s[6] = vget_low_u8(ss[6]); + s[7] = vget_low_u8(ss[7]); + d[0] = scale_filter_8(s, filters); + + s[0] = vget_high_u8(ss[0]); + s[1] = vget_high_u8(ss[1]); + s[2] = vget_high_u8(ss[2]); + s[3] = vget_high_u8(ss[3]); + s[4] = vget_high_u8(ss[4]); + s[5] = vget_high_u8(ss[5]); + s[6] = vget_high_u8(ss[6]); + s[7] = vget_high_u8(ss[7]); + d[1] = scale_filter_8(s, filters); + vst1q_u8(&dst[x], vcombine_u8(d[0], d[1])); + src_y += 16; + x += 16; + } while (x < w); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. 
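The buffer-size derivation in the comment above can be checked with plain arithmetic. In the worst normative case (h = 64, y_step_q4 = 32) with the largest sub-pel offset y0_q4 = 15, the intermediate_height expression evaluates to ((63 * 32 + 15) >> 4) + 8 = 134 rows, within the documented 135-row bound, and the 8-row tiling of the horizontal w8 pass rounds that up to 136 rows, within the (135 + 8)-row buffer declared next. A standalone check, using only constants copied from the code:

#include <assert.h>
#include <stdio.h>

int main(void) {
  const int h = 64, y_step_q4 = 32, y0_q4 = 15; /* worst case */
  const int subpel_bits = 4, subpel_taps = 8;
  const int ih = (((h - 1) * y_step_q4 + y0_q4) >> subpel_bits) + subpel_taps;
  const int ih_w8 = (ih + 7) & ~7; /* horiz_w8 works in 8-row tiles */
  printf("intermediate_height = %d, rounded for w8 = %d\n", ih, ih_w8);
  assert(ih <= 135);        /* the documented 135-row bound */
  assert(ih_w8 <= 135 + 8); /* fits temp[(135 + 8) * 64] */
  return 0;
}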
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w >= 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } +} diff --git a/libvpx/vpx_dsp/avg.c b/libvpx/vpx_dsp/avg.c index e4cd6cca7..a7ac6d953 100644 --- a/libvpx/vpx_dsp/avg.c +++ b/libvpx/vpx_dsp/avg.c @@ -34,7 +34,7 @@ unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) { // src_diff: first pass, 9 bit, dynamic range [-255, 255] // second pass, 12 bit, dynamic range [-2040, 2040] -static void hadamard_col8(const int16_t *src_diff, int src_stride, +static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; @@ -66,7 +66,7 @@ static void hadamard_col8(const int16_t *src_diff, int src_stride, // The order of the output coeff of the hadamard is not important. For // optimization purposes the final transpose may be skipped. 
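The note above about skipping the final transpose holds whenever the Hadamard output feeds a permutation-invariant reduction, such as a SATD-style sum of absolute coefficients; that this is how the callers consume it is my reading of the surrounding code, not something the diff states. The switch from int to ptrdiff_t for the stride also matches the natural type of pointer offsets and avoids implicit conversions on LP64 targets. A minimal sketch of such a reduction, with a hypothetical helper name:

#include <stdint.h>
#include <stdlib.h>

/* SATD-style reduction: any permutation of coeff, including leaving the
 * Hadamard output untransposed, gives the same sum. */
static int satd_ref(const int16_t *coeff, int n) {
  int i, sum = 0;
  for (i = 0; i < n; ++i) sum += abs(coeff[i]);
  return sum;
}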
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; int16_t buffer[64]; @@ -92,7 +92,7 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, } // In place 16x16 2D Hadamard transform -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { diff --git a/libvpx/vpx_dsp/deblock.c b/libvpx/vpx_dsp/deblock.c index 3734ac251..235e85793 100644 --- a/libvpx/vpx_dsp/deblock.c +++ b/libvpx/vpx_dsp/deblock.c @@ -9,6 +9,7 @@ */ #include <assert.h> #include <stdlib.h> +#include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" DECLARE_PROTECTED(const int16_t vpx_rv[]) = { diff --git a/libvpx/vpx_dsp/fwd_txfm.c b/libvpx/vpx_dsp/fwd_txfm.c index aa5960109..6dcb3ba66 100644 --- a/libvpx/vpx_dsp/fwd_txfm.c +++ b/libvpx/vpx_dsp/fwd_txfm.c @@ -84,7 +84,7 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { for (r = 0; r < 4; ++r) for (c = 0; c < 4; ++c) sum += input[r * stride + c]; - output[0] = sum << 1; + output[0] = sum * 2; } void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { diff --git a/libvpx/vpx_dsp/intrapred.c b/libvpx/vpx_dsp/intrapred.c index 9e2048ebf..400e632e9 100644 --- a/libvpx/vpx_dsp/intrapred.c +++ b/libvpx/vpx_dsp/intrapred.c @@ -489,30 +489,39 @@ static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; + int size; (void)left; (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], - above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); - } - dst += stride; + for (c = 0; c < bs; ++c) { + dst[c] = AVG2(above[c], above[c + 1]); + dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]); + } + for (r = 2, size = bs - 2; r < bs; r += 2, --size) { + memcpy(dst + (r + 0) * stride, dst + (r >> 1), size * sizeof(*dst)); + vpx_memset16(dst + (r + 0) * stride + size, above[bs - 1], bs - size); + memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), + size * sizeof(*dst)); + vpx_memset16(dst + (r + 1) * stride + size, above[bs - 1], bs - size); } } static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { - int r, c; + const uint16_t above_right = above[bs - 1]; + const uint16_t *const dst_row0 = dst; + int x, size; (void)left; (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r + c + 2 < bs * 2 - ? AVG3(above[r + c], above[r + c + 1], above[r + c + 2]) - : above[bs * 2 - 1]; - } + + for (x = 0; x < bs - 1; ++x) { + dst[x] = AVG3(above[x], above[x + 1], above[x + 2]); + } + dst[bs - 1] = above_right; + dst += stride; + for (x = 1, size = bs - 2; x < bs; ++x, --size) { + memcpy(dst, dst_row0 + x, size * sizeof(*dst)); + vpx_memset16(dst + size, above_right, x + 1); dst += stride; } } diff --git a/libvpx/vpx_dsp/inv_txfm.c b/libvpx/vpx_dsp/inv_txfm.c index 29323d1b8..0194aa1e1 100644 --- a/libvpx/vpx_dsp/inv_txfm.c +++ b/libvpx/vpx_dsp/inv_txfm.c @@ -105,6 +105,7 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) { return; } + // 32-bit result is enough for the following multiplications. 
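Two integer-width idioms recur in the hunks that follow, both connected to the "32-bit result is enough" comment above. In fwd_txfm.c, sum << 1 becomes sum * 2 because left-shifting a negative signed value is undefined behavior in C, while the multiplication is well-defined and compiles to the same shift. In the inverse transforms, tran_low_t inputs are narrowed to int16_t so each product provably fits in 32 bits; the high-bitdepth hunks later promote the other way, to tran_high_t, so 12-bit coefficients multiplied by 14-bit cosine constants cannot overflow 32-bit arithmetic. The sketch below shows both idioms in isolation; cospi_16_64 = 11585 (round(cos(pi/4) * 2^14)) and DCT_CONST_BITS = 14 are quoted from the libvpx headers from memory and should be treated as assumptions.

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static int32_t scale_dc(int32_t sum) {
  /* "sum << 1" is undefined for negative sum; "sum * 2" is not, and any
   * reasonable compiler still emits a single shift. */
  return sum * 2;
}

static int32_t butterfly_even(int32_t in0, int32_t in2) {
  const int16_t cospi_16_64 = 11585;
  /* Narrowing asserts the 16-bit coefficient range of a valid stream: the
   * sum is at most 2^16 in magnitude and the constant is below 2^14, so
   * the product stays below 2^30 and fits in 32 bits. */
  const int32_t temp = ((int16_t)in0 + (int16_t)in2) * cospi_16_64;
  return ROUND_POWER_OF_TWO(temp, DCT_CONST_BITS);
}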
s0 = sinpi_1_9 * x0; s1 = sinpi_2_9 * x0; s2 = sinpi_3_9 * x1; @@ -130,16 +131,16 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) { } void idct4_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step[4]; + int16_t step[4]; tran_high_t temp1, temp2; // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; + temp1 = ((int16_t)input[0] + (int16_t)input[2]) * cospi_16_64; + temp2 = ((int16_t)input[0] - (int16_t)input[2]) * cospi_16_64; step[0] = WRAPLOW(dct_const_round_shift(temp1)); step[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + temp1 = (int16_t)input[1] * cospi_24_64 - (int16_t)input[3] * cospi_8_64; + temp2 = (int16_t)input[1] * cospi_8_64 + (int16_t)input[3] * cospi_24_64; step[2] = WRAPLOW(dct_const_round_shift(temp1)); step[3] = WRAPLOW(dct_const_round_shift(temp2)); @@ -177,7 +178,8 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 4); @@ -267,20 +269,20 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { } void idct8_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[8], step2[8]; + int16_t step1[8], step2[8]; tran_high_t temp1, temp2; // stage 1 - step1[0] = input[0]; - step1[2] = input[4]; - step1[1] = input[2]; - step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1[0] = (int16_t)input[0]; + step1[2] = (int16_t)input[4]; + step1[1] = (int16_t)input[2]; + step1[3] = (int16_t)input[6]; + temp1 = (int16_t)input[1] * cospi_28_64 - (int16_t)input[7] * cospi_4_64; + temp2 = (int16_t)input[1] * cospi_4_64 + (int16_t)input[7] * cospi_28_64; step1[4] = WRAPLOW(dct_const_round_shift(temp1)); step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + temp1 = (int16_t)input[5] * cospi_12_64 - (int16_t)input[3] * cospi_20_64; + temp2 = (int16_t)input[5] * cospi_20_64 + (int16_t)input[3] * cospi_12_64; step1[5] = WRAPLOW(dct_const_round_shift(temp1)); step1[6] = WRAPLOW(dct_const_round_shift(temp2)); @@ -373,7 +375,8 @@ void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 5); @@ -552,26 +555,26 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { } void idct16_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[16], step2[16]; + int16_t step1[16], step2[16]; tran_high_t temp1, temp2; // stage 1 - step1[0] = input[0 / 2]; - step1[1] = input[16 / 2]; - step1[2] = input[8 / 2]; - step1[3] = input[24 / 2]; - step1[4] = input[4 / 2]; - step1[5] = input[20 / 2]; - step1[6] = input[12 / 2]; - step1[7] = input[28 
/ 2]; - step1[8] = input[2 / 2]; - step1[9] = input[18 / 2]; - step1[10] = input[10 / 2]; - step1[11] = input[26 / 2]; - step1[12] = input[6 / 2]; - step1[13] = input[22 / 2]; - step1[14] = input[14 / 2]; - step1[15] = input[30 / 2]; + step1[0] = (int16_t)input[0 / 2]; + step1[1] = (int16_t)input[16 / 2]; + step1[2] = (int16_t)input[8 / 2]; + step1[3] = (int16_t)input[24 / 2]; + step1[4] = (int16_t)input[4 / 2]; + step1[5] = (int16_t)input[20 / 2]; + step1[6] = (int16_t)input[12 / 2]; + step1[7] = (int16_t)input[28 / 2]; + step1[8] = (int16_t)input[2 / 2]; + step1[9] = (int16_t)input[18 / 2]; + step1[10] = (int16_t)input[10 / 2]; + step1[11] = (int16_t)input[26 / 2]; + step1[12] = (int16_t)input[6 / 2]; + step1[13] = (int16_t)input[22 / 2]; + step1[14] = (int16_t)input[14 / 2]; + step1[15] = (int16_t)input[30 / 2]; // stage 2 step2[0] = step1[0]; @@ -796,7 +799,8 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); @@ -807,64 +811,64 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } void idct32_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[32], step2[32]; + int16_t step1[32], step2[32]; tran_high_t temp1, temp2; // stage 1 - step1[0] = input[0]; - step1[1] = input[16]; - step1[2] = input[8]; - step1[3] = input[24]; - step1[4] = input[4]; - step1[5] = input[20]; - step1[6] = input[12]; - step1[7] = input[28]; - step1[8] = input[2]; - step1[9] = input[18]; - step1[10] = input[10]; - step1[11] = input[26]; - step1[12] = input[6]; - step1[13] = input[22]; - step1[14] = input[14]; - step1[15] = input[30]; - - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + step1[0] = (int16_t)input[0]; + step1[1] = (int16_t)input[16]; + step1[2] = (int16_t)input[8]; + step1[3] = (int16_t)input[24]; + step1[4] = (int16_t)input[4]; + step1[5] = (int16_t)input[20]; + step1[6] = (int16_t)input[12]; + step1[7] = (int16_t)input[28]; + step1[8] = (int16_t)input[2]; + step1[9] = (int16_t)input[18]; + step1[10] = (int16_t)input[10]; + step1[11] = (int16_t)input[26]; + step1[12] = (int16_t)input[6]; + step1[13] = (int16_t)input[22]; + step1[14] = (int16_t)input[14]; + step1[15] = (int16_t)input[30]; + + temp1 = (int16_t)input[1] * cospi_31_64 - (int16_t)input[31] * cospi_1_64; + temp2 = (int16_t)input[1] * cospi_1_64 + (int16_t)input[31] * cospi_31_64; step1[16] = WRAPLOW(dct_const_round_shift(temp1)); step1[31] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + temp1 = (int16_t)input[17] * cospi_15_64 - (int16_t)input[15] * cospi_17_64; + temp2 = (int16_t)input[17] * cospi_17_64 + (int16_t)input[15] * cospi_15_64; step1[17] = WRAPLOW(dct_const_round_shift(temp1)); step1[30] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + temp1 = (int16_t)input[9] * cospi_23_64 - (int16_t)input[23] * cospi_9_64; + temp2 = (int16_t)input[9] * cospi_9_64 + (int16_t)input[23] * cospi_23_64; step1[18] = 
WRAPLOW(dct_const_round_shift(temp1)); step1[29] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + temp1 = (int16_t)input[25] * cospi_7_64 - (int16_t)input[7] * cospi_25_64; + temp2 = (int16_t)input[25] * cospi_25_64 + (int16_t)input[7] * cospi_7_64; step1[19] = WRAPLOW(dct_const_round_shift(temp1)); step1[28] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + temp1 = (int16_t)input[5] * cospi_27_64 - (int16_t)input[27] * cospi_5_64; + temp2 = (int16_t)input[5] * cospi_5_64 + (int16_t)input[27] * cospi_27_64; step1[20] = WRAPLOW(dct_const_round_shift(temp1)); step1[27] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + temp1 = (int16_t)input[21] * cospi_11_64 - (int16_t)input[11] * cospi_21_64; + temp2 = (int16_t)input[21] * cospi_21_64 + (int16_t)input[11] * cospi_11_64; step1[21] = WRAPLOW(dct_const_round_shift(temp1)); step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + temp1 = (int16_t)input[13] * cospi_19_64 - (int16_t)input[19] * cospi_13_64; + temp2 = (int16_t)input[13] * cospi_13_64 + (int16_t)input[19] * cospi_19_64; step1[22] = WRAPLOW(dct_const_round_shift(temp1)); step1[25] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + temp1 = (int16_t)input[29] * cospi_3_64 - (int16_t)input[3] * cospi_29_64; + temp2 = (int16_t)input[29] * cospi_29_64 + (int16_t)input[3] * cospi_3_64; step1[23] = WRAPLOW(dct_const_round_shift(temp1)); step1[24] = WRAPLOW(dct_const_round_shift(temp2)); @@ -1259,7 +1263,8 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); @@ -1390,13 +1395,13 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { return; } - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; + s0 = (tran_high_t)sinpi_1_9 * x0; + s1 = (tran_high_t)sinpi_2_9 * x0; + s2 = (tran_high_t)sinpi_3_9 * x1; + s3 = (tran_high_t)sinpi_4_9 * x2; + s4 = (tran_high_t)sinpi_1_9 * x2; + s5 = (tran_high_t)sinpi_2_9 * x3; + s6 = (tran_high_t)sinpi_4_9 * x3; s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); s0 = s0 + s3 + s5; @@ -1428,12 +1433,14 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { } // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; + temp1 = (input[0] + input[2]) * (tran_high_t)cospi_16_64; + temp2 = (input[0] - input[2]) * (tran_high_t)cospi_16_64; step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * 
cospi_8_64 + input[3] * cospi_24_64; + temp1 = + input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64; + temp2 = + input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64; step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -1473,10 +1480,11 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { @@ -1514,14 +1522,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { } // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1; + s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1; + s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3; + s3 = (tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3; + s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5; + s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5; + s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7; + s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7; x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd); x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd); @@ -1537,10 +1545,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s1 = x1; s2 = x2; s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5; + s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5; + s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7; + s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7; x0 = HIGHBD_WRAPLOW(s0 + s2, bd); x1 = HIGHBD_WRAPLOW(s1 + s3, bd); @@ -1552,10 +1560,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); // stage 3 - s2 = cospi_16_64 * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (x6 - x7); + s2 = (tran_high_t)cospi_16_64 * (x2 + x3); + s3 = (tran_high_t)cospi_16_64 * (x2 - x3); + s6 = (tran_high_t)cospi_16_64 * (x6 + x7); + s7 = (tran_high_t)cospi_16_64 * (x6 - x7); x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); @@ -1589,12 +1597,16 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[2] = input[4]; step1[1] = input[2]; step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + temp1 = + input[1] * 
(tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64; + temp2 = + input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64; step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + temp1 = + input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64; + temp2 = + input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -1609,8 +1621,8 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { // stage 3 - odd half step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; @@ -1681,10 +1693,11 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); @@ -1728,22 +1741,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { } // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64; + s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64; + s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64; + s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64; + s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64; + s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64; + s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64; + s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64; + s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64; + s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64; + s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64; + s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64; + s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64; + 
s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64; + s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64; + s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64; x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); @@ -1771,14 +1784,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { s5 = x5; s6 = x6; s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64; + s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64; + s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64; + s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64; + s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64; + s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64; + s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64; + s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64; x0 = HIGHBD_WRAPLOW(s0 + s4, bd); x1 = HIGHBD_WRAPLOW(s1 + s5, bd); @@ -1802,18 +1815,18 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { s1 = x1; s2 = x2; s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64; + s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64; + s6 = -x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64; + s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64; s8 = x8; s9 = x9; s10 = x10; s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64; + s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64; + s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64; + s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64; x0 = HIGHBD_WRAPLOW(s0 + s2, bd); x1 = HIGHBD_WRAPLOW(s1 + s3, bd); @@ -1833,14 +1846,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); + s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3); + s3 = (tran_high_t)cospi_16_64 * (x2 - x3); + s6 = (tran_high_t)cospi_16_64 * (x6 + x7); + s7 = (tran_high_t)cospi_16_64 * (-x6 + x7); + s10 = (tran_high_t)cospi_16_64 * (x10 + x11); + s11 = (tran_high_t)cospi_16_64 * (-x10 + x11); + s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15); + s15 = (tran_high_t)cospi_16_64 * (x14 - x15); x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); x3 = 
HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); @@ -1910,23 +1923,31 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[6] = step1[6]; step2[7] = step1[7]; - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + temp1 = + step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64; + temp2 = + step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64; step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + temp1 = step1[9] * (tran_high_t)cospi_14_64 - + step1[14] * (tran_high_t)cospi_18_64; + temp2 = step1[9] * (tran_high_t)cospi_18_64 + + step1[14] * (tran_high_t)cospi_14_64; step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + temp1 = step1[10] * (tran_high_t)cospi_22_64 - + step1[13] * (tran_high_t)cospi_10_64; + temp2 = step1[10] * (tran_high_t)cospi_10_64 + + step1[13] * (tran_high_t)cospi_22_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + temp1 = step1[11] * (tran_high_t)cospi_6_64 - + step1[12] * (tran_high_t)cospi_26_64; + temp2 = step1[11] * (tran_high_t)cospi_26_64 + + step1[12] * (tran_high_t)cospi_6_64; step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -1936,12 +1957,16 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[2] = step2[2]; step1[3] = step2[3]; - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + temp1 = + step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64; + temp2 = + step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64; step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + temp1 = + step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64; + temp2 = + step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -1955,12 +1980,14 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; + temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64; + temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64; step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + temp1 = + step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64; + 
temp2 = + step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64; step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); @@ -1970,12 +1997,16 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[8] = step1[8]; step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + temp1 = -step1[9] * (tran_high_t)cospi_8_64 + + step1[14] * (tran_high_t)cospi_24_64; + temp2 = + step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64; step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + temp1 = -step1[10] * (tran_high_t)cospi_24_64 - + step1[13] * (tran_high_t)cospi_8_64; + temp2 = -step1[10] * (tran_high_t)cospi_8_64 + + step1[13] * (tran_high_t)cospi_24_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; @@ -1987,8 +2018,8 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; @@ -2013,12 +2044,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; + temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; + temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64; step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; @@ -2126,10 +2157,11 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); @@ -2169,43 +2201,59 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[14] = input[14]; step1[15] = input[30]; - temp1 = 
input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + temp1 = + input[1] * (tran_high_t)cospi_31_64 - input[31] * (tran_high_t)cospi_1_64; + temp2 = + input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64; step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + temp1 = input[17] * (tran_high_t)cospi_15_64 - + input[15] * (tran_high_t)cospi_17_64; + temp2 = input[17] * (tran_high_t)cospi_17_64 + + input[15] * (tran_high_t)cospi_15_64; step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + temp1 = + input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64; + temp2 = + input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64; step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + temp1 = + input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64; + temp2 = + input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64; step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + temp1 = + input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64; + temp2 = + input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64; step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + temp1 = input[21] * (tran_high_t)cospi_11_64 - + input[11] * (tran_high_t)cospi_21_64; + temp2 = input[21] * (tran_high_t)cospi_21_64 + + input[11] * (tran_high_t)cospi_11_64; step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + temp1 = input[13] * (tran_high_t)cospi_19_64 - + input[19] * (tran_high_t)cospi_13_64; + temp2 = input[13] * (tran_high_t)cospi_13_64 + + input[19] * (tran_high_t)cospi_19_64; step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + temp1 = + input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64; + temp2 = + input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64; step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -2219,23 +2267,31 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[6] = step1[6]; step2[7] = step1[7]; - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = 
step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + temp1 = + step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64; + temp2 = + step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64; step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + temp1 = step1[9] * (tran_high_t)cospi_14_64 - + step1[14] * (tran_high_t)cospi_18_64; + temp2 = step1[9] * (tran_high_t)cospi_18_64 + + step1[14] * (tran_high_t)cospi_14_64; step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + temp1 = step1[10] * (tran_high_t)cospi_22_64 - + step1[13] * (tran_high_t)cospi_10_64; + temp2 = step1[10] * (tran_high_t)cospi_10_64 + + step1[13] * (tran_high_t)cospi_22_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + temp1 = step1[11] * (tran_high_t)cospi_6_64 - + step1[12] * (tran_high_t)cospi_26_64; + temp2 = step1[11] * (tran_high_t)cospi_26_64 + + step1[12] * (tran_high_t)cospi_6_64; step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -2262,12 +2318,16 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[2] = step2[2]; step1[3] = step2[3]; - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + temp1 = + step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64; + temp2 = + step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64; step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + temp1 = + step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64; + temp2 = + step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -2282,22 +2342,30 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[16] = step2[16]; step1[31] = step2[31]; - temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; - temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; + temp1 = -step2[17] * (tran_high_t)cospi_4_64 + + step2[30] * (tran_high_t)cospi_28_64; + temp2 = step2[17] * (tran_high_t)cospi_28_64 + + step2[30] * (tran_high_t)cospi_4_64; step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; - temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; + temp1 = -step2[18] * (tran_high_t)cospi_28_64 - + step2[29] * (tran_high_t)cospi_4_64; + temp2 = -step2[18] * (tran_high_t)cospi_4_64 + + step2[29] * (tran_high_t)cospi_28_64; step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[29] = 
HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[19] = step2[19]; step1[20] = step2[20]; - temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; - temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; + temp1 = -step2[21] * (tran_high_t)cospi_20_64 + + step2[26] * (tran_high_t)cospi_12_64; + temp2 = step2[21] * (tran_high_t)cospi_12_64 + + step2[26] * (tran_high_t)cospi_20_64; step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; - temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; + temp1 = -step2[22] * (tran_high_t)cospi_12_64 - + step2[25] * (tran_high_t)cospi_20_64; + temp2 = -step2[22] * (tran_high_t)cospi_20_64 + + step2[25] * (tran_high_t)cospi_12_64; step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[23] = step2[23]; @@ -2306,12 +2374,14 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[28] = step2[28]; // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; + temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64; + temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64; step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + temp1 = + step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64; + temp2 = + step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64; step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); @@ -2321,12 +2391,16 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[8] = step1[8]; step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + temp1 = -step1[9] * (tran_high_t)cospi_8_64 + + step1[14] * (tran_high_t)cospi_24_64; + temp2 = + step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64; step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + temp1 = -step1[10] * (tran_high_t)cospi_24_64 - + step1[13] * (tran_high_t)cospi_8_64; + temp2 = -step1[10] * (tran_high_t)cospi_8_64 + + step1[13] * (tran_high_t)cospi_24_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; @@ -2356,8 +2430,8 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; @@ -2373,20 +2447,28 @@ static void highbd_idct32_c(const tran_low_t 
*input, tran_low_t *output, step1[16] = step2[16]; step1[17] = step2[17]; - temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; - temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; + temp1 = -step2[18] * (tran_high_t)cospi_8_64 + + step2[29] * (tran_high_t)cospi_24_64; + temp2 = step2[18] * (tran_high_t)cospi_24_64 + + step2[29] * (tran_high_t)cospi_8_64; step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; - temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; + temp1 = -step2[19] * (tran_high_t)cospi_8_64 + + step2[28] * (tran_high_t)cospi_24_64; + temp2 = step2[19] * (tran_high_t)cospi_24_64 + + step2[28] * (tran_high_t)cospi_8_64; step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; - temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; + temp1 = -step2[20] * (tran_high_t)cospi_24_64 - + step2[27] * (tran_high_t)cospi_8_64; + temp2 = -step2[20] * (tran_high_t)cospi_8_64 + + step2[27] * (tran_high_t)cospi_24_64; step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; - temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; + temp1 = -step2[21] * (tran_high_t)cospi_24_64 - + step2[26] * (tran_high_t)cospi_8_64; + temp2 = -step2[21] * (tran_high_t)cospi_8_64 + + step2[26] * (tran_high_t)cospi_24_64; step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[22] = step2[22]; @@ -2407,12 +2489,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; + temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; + temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64; step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; @@ -2458,20 +2540,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[17] = step2[17]; step1[18] = step2[18]; step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * cospi_16_64; - temp2 = (step2[20] + step2[27]) * cospi_16_64; + temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64; + temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64; step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step2[21] + step2[26]) * cospi_16_64; - temp2 = (step2[21] + step2[26]) * cospi_16_64; + temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64; + temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64; step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[26] = 
HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step2[22] + step2[25]) * cospi_16_64; - temp2 = (step2[22] + step2[25]) * cospi_16_64; + temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64; + temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64; step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step2[23] + step2[24]) * cospi_16_64; - temp2 = (step2[23] + step2[24]) * cospi_16_64; + temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64; + temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64; step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[28] = step2[28]; @@ -2603,10 +2685,11 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; int a1; - tran_low_t out = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { diff --git a/libvpx/vpx_dsp/mips/avg_msa.c b/libvpx/vpx_dsp/mips/avg_msa.c index 48b841969..d0ac7b8e2 100644 --- a/libvpx/vpx_dsp/mips/avg_msa.c +++ b/libvpx/vpx_dsp/mips/avg_msa.c @@ -56,7 +56,8 @@ uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { return sum_out; } -void vpx_hadamard_8x8_msa(const int16_t *src, int src_stride, int16_t *dst) { +void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride, + int16_t *dst) { v8i16 src0, src1, src2, src3, src4, src5, src6, src7; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; @@ -80,7 +81,8 @@ void vpx_hadamard_8x8_msa(const int16_t *src, int src_stride, int16_t *dst) { ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8); } -void vpx_hadamard_16x16_msa(const int16_t *src, int src_stride, int16_t *dst) { +void vpx_hadamard_16x16_msa(const int16_t *src, ptrdiff_t src_stride, + int16_t *dst) { v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; diff --git a/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c index ae88eddfd..18e7d5375 100644 --- a/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c @@ -219,9 +219,10 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; uint32_t pos = 38; assert(y_step_q4 == 16); @@ -247,8 +248,8 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git 
a/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c index e944207b6..7dcb662d7 100644 --- a/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c @@ -751,9 +751,10 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; uint32_t pos = 38; assert(x_step_q4 == 16); @@ -793,8 +794,8 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c index 5cc06b5f2..9e65a8f50 100644 --- a/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c @@ -628,9 +628,10 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; uint32_t pos = 38; assert(x_step_q4 == 16); @@ -672,8 +673,8 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c index eb1975e44..a3e967b40 100644 --- a/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c @@ -201,9 +201,10 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; uint32_t pos = 38; assert(y_step_q4 == 16); @@ -228,8 +229,8 @@ void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c index b4ed6ee85..d9c2bef69 100644 --- a/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c +++ 
b/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c @@ -334,15 +334,16 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -367,8 +368,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } @@ -376,8 +377,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { /* Fixed size intermediate buffer places limits on parameters. */ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); @@ -390,24 +391,26 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, if (intermediate_height < h) intermediate_height = h; - vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, intermediate_height); + vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height); - vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { int x, y; uint32_t tp1, tp2, tn1, tp3, tp4, tn2; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c index 9a9bab25a..fb68ad881 100644 --- a/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c @@ -938,15 +938,16 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const 
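/* Annotation (editor's, not part of the patch): each of these dspr2
 * convolve entry points used to receive pre-selected filter_x/filter_y
 * pointers; they now take the whole InterpKernel table plus the x0_q4 and
 * y0_q4 phase indices and select their own 8-tap row, as the added
 * "const int16_t *const filter_x = filter[x0_q4];" lines show. */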
InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -987,9 +988,8 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, - h); + vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_dspr2.c index 8d35b6394..89f0f4196 100644 --- a/libvpx/vpx_dsp/mips/convolve8_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve8_dspr2.c @@ -1296,9 +1296,11 @@ void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, } void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; uint32_t pos = 38; @@ -1395,14 +1397,15 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { int x, y; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c index 196a0a2f0..77e95c844 100644 --- a/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c @@ -818,15 +818,16 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -868,8 +869,8 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t 
src_stride, (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c index ad107d5c4..c329f71cc 100644 --- a/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c @@ -318,15 +318,16 @@ static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -349,8 +350,8 @@ void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/convolve_common_dspr2.h b/libvpx/vpx_dsp/mips/convolve_common_dspr2.h index 4eee3bd5e..48e440d73 100644 --- a/libvpx/vpx_dsp/mips/convolve_common_dspr2.h +++ b/libvpx/vpx_dsp/mips/convolve_common_dspr2.h @@ -24,21 +24,21 @@ extern "C" { #if HAVE_DSPR2 void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h); void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter, int w, @@ -46,9 +46,9 @@ void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); #endif // #if 
HAVE_DSPR2 #ifdef __cplusplus diff --git a/libvpx/vpx_dsp/mips/fwd_txfm_msa.c b/libvpx/vpx_dsp/mips/fwd_txfm_msa.c index f786664bb..5a6dfcef2 100644 --- a/libvpx/vpx_dsp/mips/fwd_txfm_msa.c +++ b/libvpx/vpx_dsp/mips/fwd_txfm_msa.c @@ -8,8 +8,23 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/fwd_txfm_msa.h" +void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *out, int32_t stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v4i32 vec_w; + + LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); + ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); + ADD2(in0, in2, in4, in6, in0, in4); + vec_w = __msa_hadd_s_w(in0, in0); + vec_w += __msa_hadd_s_w(in4, in4); + out[0] = HADD_SW_S32(vec_w); + out[1] = 0; +} + +#if !CONFIG_VP9_HIGHBITDEPTH void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride) { v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; @@ -215,19 +230,6 @@ void vpx_fdct8x8_msa(const int16_t *input, int16_t *output, ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); } -void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v4i32 vec_w; - - LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); - ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); - ADD2(in0, in2, in4, in6, in0, in4); - vec_w = __msa_hadd_s_w(in0, in0); - vec_w += __msa_hadd_s_w(in4, in4); - out[0] = HADD_SW_S32(vec_w); - out[1] = 0; -} - void vpx_fdct16x16_msa(const int16_t *input, int16_t *output, int32_t src_stride) { int32_t i; @@ -267,3 +269,4 @@ void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) { sum = HADD_SW_S32(vec_w); out[0] = (int16_t)(sum >> 1); } +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/mips/itrans4_dspr2.c b/libvpx/vpx_dsp/mips/itrans4_dspr2.c index 3f985b847..e214b538d 100644 --- a/libvpx/vpx_dsp/mips/itrans4_dspr2.c +++ b/libvpx/vpx_dsp/mips/itrans4_dspr2.c @@ -343,6 +343,7 @@ void iadst4_dspr2(const int16_t *input, int16_t *output) { return; } + // 32-bit result is enough for the following multiplications. s0 = sinpi_1_9 * x0; s1 = sinpi_2_9 * x0; s2 = sinpi_3_9 * x1; diff --git a/libvpx/vpx_dsp/mips/loopfilter_16_msa.c b/libvpx/vpx_dsp/mips/loopfilter_16_msa.c index b73d56bd5..b1731f234 100644 --- a/libvpx/vpx_dsp/mips/loopfilter_16_msa.c +++ b/libvpx/vpx_dsp/mips/loopfilter_16_msa.c @@ -8,13 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "vpx_ports/mem.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" +#include "vpx_ports/mem.h" -int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, + uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -77,7 +79,7 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, } } -void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { +static void hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { v16u8 flat, flat2, filter8; v16i8 zero = { 0 }; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -413,11 +415,11 @@ static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch, (void)count; - early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, - limit_ptr, thresh_ptr); + early_exit = hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); if (0 == early_exit) { - vpx_hz_lpf_t16_16w(src, pitch, filter48); + hz_lpf_t16_16w(src, pitch, filter48); } } @@ -753,11 +755,11 @@ static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); } -int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, - uint8_t *src_org, int32_t pitch_org, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch_org, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -820,8 +822,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, } } -int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, - uint8_t *filter48) { +static int32_t vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { v16i8 zero = { 0 }; v16u8 filter8, flat, flat2; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -1051,12 +1053,12 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); early_exit = - vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, - pitch, b_limit_ptr, limit_ptr, thresh_ptr); + vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, pitch, + b_limit_ptr, limit_ptr, thresh_ptr); if (0 == early_exit) { - early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, - &filter48[0]); + early_exit = + vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, &filter48[0]); if (0 == early_exit) { transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); @@ -1064,11 +1066,11 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, } } -int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, - uint8_t *src_org, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, 
int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -1141,8 +1143,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, } } -int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, - uint8_t *filter48) { +static int32_t vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { v16u8 flat, flat2, filter8; v16i8 zero = { 0 }; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -1473,12 +1475,12 @@ void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, transpose_16x16((src - 8), pitch, &transposed_input[0], 16); early_exit = - vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, - pitch, b_limit_ptr, limit_ptr, thresh_ptr); + vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); if (0 == early_exit) { - early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, - &filter48[0]); + early_exit = + vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]); if (0 == early_exit) { transpose_16x16(transposed_input, 16, (src - 8), pitch); diff --git a/libvpx/vpx_dsp/mips/loopfilter_4_msa.c b/libvpx/vpx_dsp/mips/loopfilter_4_msa.c index 9500cd2fd..0eff2b6ca 100644 --- a/libvpx/vpx_dsp/mips/loopfilter_4_msa.c +++ b/libvpx/vpx_dsp/mips/loopfilter_4_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, diff --git a/libvpx/vpx_dsp/mips/loopfilter_8_msa.c b/libvpx/vpx_dsp/mips/loopfilter_8_msa.c index a22c62bb3..703fcce8a 100644 --- a/libvpx/vpx_dsp/mips/loopfilter_8_msa.c +++ b/libvpx/vpx_dsp/mips/loopfilter_8_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, diff --git a/libvpx/vpx_dsp/mips/macros_msa.h b/libvpx/vpx_dsp/mips/macros_msa.h index 27b38865a..f9a446e7b 100644 --- a/libvpx/vpx_dsp/mips/macros_msa.h +++ b/libvpx/vpx_dsp/mips/macros_msa.h @@ -16,207 +16,149 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) -#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) - -#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) -#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) - -#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) - -#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) -#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) - -#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) - -#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) +#define LD_V(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UB(...) LD_V(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_V(v16i8, __VA_ARGS__) +#define LD_UH(...) LD_V(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_V(v8i16, __VA_ARGS__) +#define LD_SW(...) 
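/* Annotation (editor's, not part of the patch): LD_B/LD_H/LD_W (and the
 * matching ST_* stores) differed only in vector type, so this hunk folds
 * them into single LD_V/ST_V macros and re-points LD_UB, LD_SH, ST_SW and
 * friends at those. The (__mips_isa_rev >= 6) LH/LW/LD/SH/SW/SD variants
 * below likewise drop their inline asm in favor of plain dereferences,
 * presumably because r6 loads and stores handle misalignment natively. */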
LD_V(v4i32, __VA_ARGS__) + +#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_V(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_V(v16i8, __VA_ARGS__) +#define ST_SH(...) ST_V(v8i16, __VA_ARGS__) +#define ST_SW(...) ST_V(v4i32, __VA_ARGS__) #if (__mips_isa_rev >= 6) -#define LH(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LH(psrc) \ + ({ \ + uint16_t val_lh_m = *(const uint16_t *)(psrc); \ + val_lh_m; \ }) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + uint32_t val_lw_m = *(const uint32_t *)(psrc); \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + uint64_t val_ld_m = *(const uint64_t *)(psrc); \ + val_ld_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m); \ - val1_m = LW(psrc_m + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint32_t val0_ld_m, val1_ld_m; \ + uint64_t val_ld_m = 0; \ + \ + val0_ld_m = LW(psrc_ld_m); \ + val1_ld_m = LW(psrc_ld_m + 4); \ + \ + val_ld_m = (uint64_t)(val1_ld_m); \ + val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ + val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ + \ + val_ld_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } +#define SH(val, pdst) *(uint16_t *)(pdst) = (val); +#define SW(val, pdst) *(uint32_t *)(pdst) = (val); +#define SD(val, pdst) *(uint64_t *)(pdst) = (val); #else // !(__mips_isa_rev >= 6) -#define LH(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_lh_m = (const uint8_t *)(psrc); \ + uint16_t val_lh_m; \ + \ + __asm__ __volatile__("ulh %[val_lh_m], %[psrc_lh_m] \n\t" \ + \ + : [val_lh_m] "=r"(val_lh_m) \ + : [psrc_lh_m] "m"(*psrc_lh_m)); \ + \ + val_lh_m; \ }) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t 
*)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ + uint32_t val_lw_m; \ + \ + __asm__ __volatile__("ulw %[val_lw_m], %[psrc_lw_m] \n\t" \ + \ + : [val_lw_m] "=r"(val_lw_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ + \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint64_t val_ld_m = 0; \ + \ + __asm__ __volatile__("uld %[val_ld_m], %[psrc_ld_m] \n\t" \ + \ + : [val_ld_m] "=r"(val_ld_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ + \ + val_ld_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m_combined = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m_combined = (uint64_t)(val1_m); \ - val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ - val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ - \ - val_m_combined; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint32_t val0_ld_m, val1_ld_m; \ + uint64_t val_ld_m = 0; \ + \ + val0_ld_m = LW(psrc_ld_m); \ + val1_ld_m = LW(psrc_ld_m + 4); \ + \ + val_ld_m = (uint64_t)(val1_ld_m); \ + val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ + val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ + \ + val_ld_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *pdst_sh_m = (uint8_t *)(pdst); \ + const uint16_t val_sh_m = (val); \ + \ + __asm__ __volatile__("ush %[val_sh_m], %[pdst_sh_m] \n\t" \ + \ + : [pdst_sh_m] "=m"(*pdst_sh_m) \ + : [val_sh_m] "r"(val_sh_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *pdst_sw_m = (uint8_t *)(pdst); \ + const uint32_t val_sw_m = (val); \ + \ + __asm__ __volatile__("usw %[val_sw_m], %[pdst_sw_m] \n\t" \ + \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_sw_m] "r"(val_sw_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ - \ - val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ +#define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *)(pdst); \ + uint32_t val0_sd_m, val1_sd_m; \ + \ + val0_sd_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_sd_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_sd_m, pdst_sd_m); \ + SW(val1_sd_m, pdst_sd_m + 4); \ } #endif // (__mips_isa_rev >= 6) @@ -283,97 +225,73 @@ SD(in3, (pdst) + 3 * stride); \ } -/* Description : Load vectors with 16 byte elements with stride +/* Description : Load vector 
elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 Return Type - as per RTYPE Details : Load 16 byte elements in 'out0' from (psrc) Load 16 byte elements in 'out1' from (psrc + stride) */ -#define LD_B2(RTYPE, psrc, stride, out0, out1) \ +#define LD_V2(RTYPE, psrc, stride, out0, out1) \ { \ - out0 = LD_B(RTYPE, (psrc)); \ - out1 = LD_B(RTYPE, (psrc) + stride); \ + out0 = LD_V(RTYPE, (psrc)); \ + out1 = LD_V(RTYPE, (psrc) + stride); \ } -#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) -#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) +#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__) +#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__) +#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__) -#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ +#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \ { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ + LD_V2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_V(RTYPE, (psrc) + 2 * stride); \ } -#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) +#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__) -#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ +#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \ { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + LD_V2(RTYPE, (psrc), stride, out0, out1); \ + LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ } -#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) -#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) +#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__) +#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__) -#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ +#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ + LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_V(RTYPE, (psrc) + 4 * stride); \ } -#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) -#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) +#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__) +#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__) -#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ +#define LD_V7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ { \ - LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ - LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ + LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ } -#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) +#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__) -#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ +#define LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ out7) \ { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ - } -#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) -#define LD_SB8(...) 
LD_B8(v16i8, __VA_ARGS__) - -/* Description : Load vectors with 8 halfword elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Details : Load 8 halfword elements in 'out0' from (psrc) - Load 8 halfword elements in 'out1' from (psrc + stride) -*/ -#define LD_H2(RTYPE, psrc, stride, out0, out1) \ - { \ - out0 = LD_H(RTYPE, (psrc)); \ - out1 = LD_H(RTYPE, (psrc) + (stride)); \ - } -#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) - -#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ - { \ - LD_H2(RTYPE, (psrc), stride, out0, out1); \ - LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ } -#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) +#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__) +#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__) -#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ - out7) \ - { \ - LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ - } -#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) - -#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ +#define LD_V16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ out7, out8, out9, out10, out11, out12, out13, out14, out15) \ { \ - LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ + LD_V8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ out7); \ - LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ + LD_V8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ out13, out14, out15); \ } -#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) +#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__) /* Description : Load 4x4 block of signed halfword elements from 1D source data into 4 vectors (Each vector with 4 signed halfwords) @@ -388,79 +306,35 @@ out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ } -/* Description : Load 2 vectors of signed word elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Return Type - signed word -*/ -#define LD_SW2(psrc, stride, out0, out1) \ - { \ - out0 = LD_SW((psrc)); \ - out1 = LD_SW((psrc) + stride); \ - } - -/* Description : Store vectors of 16 byte elements with stride +/* Description : Store vectors with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 16 byte elements from 'in0' to (pdst) Store 16 byte elements from 'in1' to (pdst + stride) */ -#define ST_B2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_B(RTYPE, in0, (pdst)); \ - ST_B(RTYPE, in1, (pdst) + stride); \ - } -#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) - -#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ - { \ - ST_B2(RTYPE, in0, in1, (pdst), stride); \ - ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ - } -#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) - -#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ - { \ - ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ - ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ - } -#define ST_UB8(...) 
ST_B8(v16u8, __VA_ARGS__) - -/* Description : Store vectors of 8 halfword elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 8 halfword elements from 'in0' to (pdst) - Store 8 halfword elements from 'in1' to (pdst + stride) -*/ -#define ST_H2(RTYPE, in0, in1, pdst, stride) \ +#define ST_V2(RTYPE, in0, in1, pdst, stride) \ { \ - ST_H(RTYPE, in0, (pdst)); \ - ST_H(RTYPE, in1, (pdst) + stride); \ + ST_V(RTYPE, in0, (pdst)); \ + ST_V(RTYPE, in1, (pdst) + stride); \ } -#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) +#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__) +#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__) +#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__) -#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ +#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \ { \ - ST_H2(RTYPE, in0, in1, (pdst), stride); \ - ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + ST_V2(RTYPE, in0, in1, (pdst), stride); \ + ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ } -#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) +#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__) +#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__) -#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ +#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ { \ - ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ - ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ - } -#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) - -/* Description : Store vectors of word elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 4 word elements from 'in0' to (pdst) - Store 4 word elements from 'in1' to (pdst + stride) -*/ -#define ST_SW2(in0, in1, pdst, stride) \ - { \ - ST_SW(in0, (pdst)); \ - ST_SW(in1, (pdst) + stride); \ + ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ } +#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__) +#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__) /* Description : Store 2x4 byte block to destination memory from input vector Arguments : Inputs - in, stidx, pdst, stride @@ -681,6 +555,7 @@ #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) +#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__) #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ out3) \ @@ -1308,6 +1183,7 @@ out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ } #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SB(...) ILVRL_W2(v16i8, __VA_ARGS__) #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) #define ILVRL_W2_SW(...) 
ILVRL_W2(v4i32, __VA_ARGS__) @@ -1721,6 +1597,25 @@ out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ } +/* Description : Sign extend byte elements from input vector and return + halfword results in pair of vectors + Arguments : Input - in (byte vector) + Outputs - out0, out1 (sign extended halfword vectors) + Return Type - signed halfword + Details : Sign bit of byte elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 8 signed halfword elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 8 signed halfword elements in 'out1' +*/ +#define UNPCK_SB_SH(in, out0, out1) \ + { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_clti_s_b((v16i8)in, 0); \ + ILVRL_B2_SH(tmp_m, in, out0, out1); \ + } + /* Description : Zero extend unsigned byte elements to halfword elements Arguments : Input - in (unsigned byte vector) Outputs - out0, out1 (unsigned halfword vectors) @@ -1879,8 +1774,6 @@ out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ \ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ @@ -2034,19 +1927,17 @@ /* Description : Converts inputs to unsigned bytes, interleave, average & store as 8x4 unsigned byte block - Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, - pdst, stride + Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride */ -#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ - pdst, stride) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - tmp0_m = PCKEV_XORI128_UB(in0, in1); \ - tmp1_m = PCKEV_XORI128_UB(in2, in3); \ - ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } /* Description : Pack even byte elements and store byte vector in destination diff --git a/libvpx/vpx_dsp/mips/sad_mmi.c b/libvpx/vpx_dsp/mips/sad_mmi.c new file mode 100644 index 000000000..33bd3fe7f --- /dev/null +++ b/libvpx/vpx_dsp/mips/sad_mmi.c @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#define SAD_SRC_REF_ABS_SUB_64 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_32 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], 
%[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_16 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_8 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + +#if _MIPS_SIM == _ABIO32 +#define SAD_SRC_REF_ABS_SUB_4 \ + "ulw %[tmp0], 0x00(%[src]) \n\t" \ + "mtc1 %[tmp0], %[ftmp1] \n\t" \ + "ulw %[tmp0], 0x00(%[ref]) \n\t" \ + "mtc1 %[tmp0], %[ftmp2] \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */ +#define SAD_SRC_REF_ABS_SUB_4 \ + "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" \ + "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \ + "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#endif /* _MIPS_SIM == _ABIO32 */ + +#define SAD_SRC_AVGREF_ABS_SUB_64 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ 
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_32 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + 
"gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_16 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_8 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + +#if _MIPS_SIM == _ABIO32 +#define SAD_SRC_AVGREF_ABS_SUB_4 \ + "ulw %[tmp0], 0x00(%[second_pred]) \n\t" \ + "mtc1 %[tmp0], %[ftmp1] \n\t" \ + "ulw %[tmp0], 0x00(%[ref]) \n\t" \ + "mtc1 %[tmp0], %[ftmp2] \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */ +#define SAD_SRC_AVGREF_ABS_SUB_4 \ + "gslwlc1 %[ftmp1], 0x03(%[second_pred]) \n\t" \ + "gslwrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \ + "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#endif /* _MIPS_SIM == _ABIO32 */ + +// depending on call sites, pass **ref_array to avoid & in subsequent call and +// de-dup with 4D below. 
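/* Annotation (editor's, not part of the patch): a sketch of what the 4D
 * wrapper below expands to for one instantiation, sadMxNx4D_mmi(16, 16),
 * assuming the vpx_sad16x16_mmi kernel generated further down in this
 * file; it scores four candidate reference blocks with the same pairwise
 * SAD. */
void vpx_sad16x16x4d_mmi(const uint8_t *src, int src_stride,
                         const uint8_t *const ref_array[], int ref_stride,
                         uint32_t *sad_array) {
  int i;
  for (i = 0; i < 4; ++i)
    sad_array[i] = vpx_sad16x16_mmi(src, src_stride, ref_array[i], ref_stride);
}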
+#define sadMxNxK_mmi(m, n, k) \ + void vpx_sad##m##x##n##x##k##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref_array, int ref_stride, \ + uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_mmi(src, src_stride, &ref_array[i], ref_stride); \ + } + +// This appears to be equivalent to the above when k == 4 and refs is const +#define sadMxNx4D_mmi(m, n) \ + void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_mmi(src, src_stride, ref_array[i], ref_stride); \ + } + +static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_64 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_64 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad64xN(H) \ + unsigned int vpx_sad64x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad64x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad64xN(64); +vpx_sad64xN(32); +sadMxNx4D_mmi(64, 64); +sadMxNx4D_mmi(64, 32); + +static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
+ SAD_SRC_AVGREF_ABS_SUB_64 + MMI_ADDIU(%[second_pred], %[second_pred], 0x40) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_64 + MMI_ADDIU(%[second_pred], %[second_pred], 0x40) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg64xN(H) \ + unsigned int vpx_sad64x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg64x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg64xN(64); +vpx_sad_avg64xN(32); + +static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_32 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_32 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad32xN(H) \ + unsigned int vpx_sad32x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad32x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad32xN(64); +vpx_sad32xN(32); +vpx_sad32xN(16); +sadMxNx4D_mmi(32, 64); +sadMxNx4D_mmi(32, 32); +sadMxNx4D_mmi(32, 16); + +static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
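// Editor's note: in the _avg kernels, pavgb first forms the rounding byte
// average (ref + second_pred + 1) >> 1, so the SAD is measured against the
// compound (averaged) predictor, matching the generic C _avg behavior.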
+ SAD_SRC_AVGREF_ABS_SUB_32 + MMI_ADDIU(%[second_pred], %[second_pred], 0x20) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_32 + MMI_ADDIU(%[second_pred], %[second_pred], 0x20) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg32xN(H) \ + unsigned int vpx_sad32x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg32x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg32xN(64); +vpx_sad_avg32xN(32); +vpx_sad_avg32xN(16); + +static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_16 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_16 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad16xN(H) \ + unsigned int vpx_sad16x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad16x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad16xN(32); +vpx_sad16xN(16); +vpx_sad16xN(8); +sadMxNxK_mmi(16, 16, 3); +sadMxNxK_mmi(16, 16, 8); +sadMxNxK_mmi(16, 8, 3); +sadMxNxK_mmi(16, 8, 8); +sadMxNx4D_mmi(16, 32); +sadMxNx4D_mmi(16, 16); +sadMxNx4D_mmi(16, 8); + +static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
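// Editor's note: second_pred points at a contiguous width x height block,
// so the pointer advances by the row width (0x10 = 16 bytes here, 0x20 and
// 0x40 in the 32- and 64-wide kernels) rather than by an external stride.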
+ SAD_SRC_AVGREF_ABS_SUB_16 + MMI_ADDIU(%[second_pred], %[second_pred], 0x10) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_16 + MMI_ADDIU(%[second_pred], %[second_pred], 0x10) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg16xN(H) \ + unsigned int vpx_sad16x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg16x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg16xN(32); +vpx_sad_avg16xN(16); +vpx_sad_avg16xN(8); + +static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_8 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_8 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad8xN(H) \ + unsigned int vpx_sad8x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad8x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad8xN(16); +vpx_sad8xN(8); +vpx_sad8xN(4); +sadMxNxK_mmi(8, 16, 3); +sadMxNxK_mmi(8, 16, 8); +sadMxNxK_mmi(8, 8, 3); +sadMxNxK_mmi(8, 8, 8); +sadMxNx4D_mmi(8, 16); +sadMxNx4D_mmi(8, 8); +sadMxNx4D_mmi(8, 4); + +static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
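// Editor's note: an 8-pixel row fits one 64-bit FP register, so the 8- and
// 4-wide kernels get by with three temporaries and accumulate the SAD in
// ftmp3 rather than ftmp5.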
+ SAD_SRC_AVGREF_ABS_SUB_8 + MMI_ADDIU(%[second_pred], %[second_pred], 0x08) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_8 + MMI_ADDIU(%[second_pred], %[second_pred], 0x08) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg8xN(H) \ + unsigned int vpx_sad8x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg8x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg8xN(16); +vpx_sad_avg8xN(8); +vpx_sad_avg8xN(4); + +static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_4 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_4 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad4xN(H) \ + unsigned int vpx_sad4x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad4x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad4xN(8); +vpx_sad4xN(4); +sadMxNxK_mmi(4, 4, 3); +sadMxNxK_mmi(4, 4, 8); +sadMxNx4D_mmi(4, 8); +sadMxNx4D_mmi(4, 4); + +static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
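// Editor's note: the 4-wide loads fill only the low 32 bits of the FP
// register; SAD_SRC_AVGREF_ABS_SUB_4 clears the high half with "mthc1 $0"
// before biadd so that stale upper-half data is not folded into the sum.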
+ SAD_SRC_AVGREF_ABS_SUB_4 + MMI_ADDIU(%[second_pred], %[second_pred], 0x04) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_4 + MMI_ADDIU(%[second_pred], %[second_pred], 0x04) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg4xN(H) \ + unsigned int vpx_sad4x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg4x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg4xN(8); +vpx_sad_avg4xN(4); diff --git a/libvpx/vpx_dsp/mips/sad_msa.c b/libvpx/vpx_dsp/mips/sad_msa.c index e295123ac..ab681ae9f 100644 --- a/libvpx/vpx_dsp/mips/sad_msa.c +++ b/libvpx/vpx_dsp/mips/sad_msa.c @@ -283,96 +283,6 @@ static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, sad_array[2] = HADD_UH_U32(sad2); } -static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = height >> 1; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v4u32 sad; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3); - ref0_4 = LD_UB(ref + 64); - ref += ref_stride; - - sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - 
SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[0] = HADD_SW_S32((v4i32)sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[1] = HADD_SW_S32((v4i32)sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[2] = HADD_SW_S32((v4i32)sad); -} - static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t height, uint32_t *sad_array) { @@ -623,176 +533,6 @@ static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, sad_array[7] = HADD_UH_U32(sad7); } -static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1; - v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); - sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); - sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); - sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); - sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - const uint8_t *src_dup, *ref_dup; - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4; - v16u8 ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v8u16 sad3_0 = { 0 }; - v8u16 sad3_1 = { 0 }; - v4u32 sad; - - src_dup = src; - ref_dup = ref; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); - ref += ref_stride; - - sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); - 
sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3); - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[0] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[1] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[2] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad3_0, sad3_0); - sad += __msa_hadd_u_w(sad3_1, sad3_1); - sad_array[3] = HADD_SW_S32(sad); - - sad0_0 = (v8u16)__msa_ldi_h(0); - sad0_1 = (v8u16)__msa_ldi_h(0); - sad1_0 = (v8u16)__msa_ldi_h(0); - sad1_1 = (v8u16)__msa_ldi_h(0); - sad2_0 = (v8u16)__msa_ldi_h(0); - sad2_1 = (v8u16)__msa_ldi_h(0); - sad3_0 = (v8u16)__msa_ldi_h(0); - sad3_1 = (v8u16)__msa_ldi_h(0); - - for (ht_cnt = 64; ht_cnt--;) { - LD_UB4(src_dup, 16, src0, src1, src2, src3); - src_dup += src_stride; - LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); - ref_dup += ref_stride; - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4); - sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7); - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[4] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[5] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[6] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad3_0, sad3_0); - sad += __msa_hadd_u_w(sad3_1, sad3_1); - sad_array[7] = HADD_SW_S32(sad); -} - static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *const aref_ptr[], int32_t ref_stride, int32_t height, @@ -1318,20 +1058,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ } -#define VPX_SAD_32xHEIGHTx3_MSA(height) \ - void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_64xHEIGHTx3_MSA(height) \ - void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t 
*ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - #define VPX_SAD_4xHEIGHTx8_MSA(height) \ void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ @@ -1353,20 +1079,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ } -#define VPX_SAD_32xHEIGHTx8_MSA(height) \ - void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_64xHEIGHTx8_MSA(height) \ - void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - #define VPX_SAD_4xHEIGHTx4D_MSA(height) \ void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *const refs[], \ @@ -1444,43 +1156,31 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, // 64x64 VPX_SAD_64xHEIGHT_MSA(64); -VPX_SAD_64xHEIGHTx3_MSA(64); -VPX_SAD_64xHEIGHTx8_MSA(64); VPX_SAD_64xHEIGHTx4D_MSA(64); VPX_AVGSAD_64xHEIGHT_MSA(64); // 64x32 VPX_SAD_64xHEIGHT_MSA(32); -VPX_SAD_64xHEIGHTx3_MSA(32); -VPX_SAD_64xHEIGHTx8_MSA(32); VPX_SAD_64xHEIGHTx4D_MSA(32); VPX_AVGSAD_64xHEIGHT_MSA(32); // 32x64 VPX_SAD_32xHEIGHT_MSA(64); -VPX_SAD_32xHEIGHTx3_MSA(64); -VPX_SAD_32xHEIGHTx8_MSA(64); VPX_SAD_32xHEIGHTx4D_MSA(64); VPX_AVGSAD_32xHEIGHT_MSA(64); // 32x32 VPX_SAD_32xHEIGHT_MSA(32); -VPX_SAD_32xHEIGHTx3_MSA(32); -VPX_SAD_32xHEIGHTx8_MSA(32); VPX_SAD_32xHEIGHTx4D_MSA(32); VPX_AVGSAD_32xHEIGHT_MSA(32); // 32x16 VPX_SAD_32xHEIGHT_MSA(16); -VPX_SAD_32xHEIGHTx3_MSA(16); -VPX_SAD_32xHEIGHTx8_MSA(16); VPX_SAD_32xHEIGHTx4D_MSA(16); VPX_AVGSAD_32xHEIGHT_MSA(16); // 16x32 VPX_SAD_16xHEIGHT_MSA(32); -VPX_SAD_16xHEIGHTx3_MSA(32); -VPX_SAD_16xHEIGHTx8_MSA(32); VPX_SAD_16xHEIGHTx4D_MSA(32); VPX_AVGSAD_16xHEIGHT_MSA(32); @@ -1514,15 +1214,11 @@ VPX_AVGSAD_8xHEIGHT_MSA(8); // 8x4 VPX_SAD_8xHEIGHT_MSA(4); -VPX_SAD_8xHEIGHTx3_MSA(4); -VPX_SAD_8xHEIGHTx8_MSA(4); VPX_SAD_8xHEIGHTx4D_MSA(4); VPX_AVGSAD_8xHEIGHT_MSA(4); // 4x8 VPX_SAD_4xHEIGHT_MSA(8); -VPX_SAD_4xHEIGHTx3_MSA(8); -VPX_SAD_4xHEIGHTx8_MSA(8); VPX_SAD_4xHEIGHTx4D_MSA(8); VPX_AVGSAD_4xHEIGHT_MSA(8); diff --git a/libvpx/vpx_dsp/mips/subtract_mmi.c b/libvpx/vpx_dsp/mips/subtract_mmi.c new file mode 100644 index 000000000..9f361704a --- /dev/null +++ b/libvpx/vpx_dsp/mips/subtract_mmi.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + double ftmp[13]; + uint32_t tmp[1]; + + if (rows == cols) { + switch (rows) { + case 4: + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp2] \n\t" +#else + "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp2], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp2], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp3] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp4] \n\t" +#else + "gslwlc1 %[ftmp3], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp3], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp4], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp4], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp5] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" +#else + "gslwlc1 %[ftmp5], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp6], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" +#else + "gslwlc1 %[ftmp7], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp8], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp8], 0x00(%[pred]) \n\t" +#endif + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp2], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp4], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp6], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp8], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), +#if _MIPS_SIM == _ABIO32 + [tmp0] "=&r"(tmp[0]), +#endif + [src] "+&r"(src), [pred] "+&r"(pred), [diff] "+&r"(diff) + : 
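// Editor's note: under the O32 ABI each 4-byte row goes through a GPR
// (ulw + mtc1), while the 64-bit ABIs load it with the Loongson unaligned
// FP loads (gslwlc1/gslwrc1) directly, as in the conditional blocks above.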
[src_stride] "r"((mips_reg)src_stride), + [pred_stride] "r"((mips_reg)pred_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 8: + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x02 \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp4], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp7], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp8], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] 
"=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src), + [pred] "+&r"(pred), [diff] "+&r"(diff) + : [pred_stride] "r"((mips_reg)pred_stride), + [src_stride] "r"((mips_reg)src_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 16: + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x08 \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t" + "gsldlc1 %[ftmp3], 0x0f(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x08(%[src]) \n\t" + "gsldlc1 %[ftmp4], 0x0f(%[pred]) \n\t" + "gsldrc1 %[ftmp4], 0x08(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t" + "gsldlc1 %[ftmp7], 0x0f(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldlc1 %[ftmp8], 0x0f(%[pred]) \n\t" + "gsldrc1 %[ftmp8], 0x08(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), 
[ftmp11] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src), + [pred] "+&r"(pred), [diff] "+&r"(diff) + : [pred_stride] "r"((mips_reg)pred_stride), + [src_stride] "r"((mips_reg)src_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 32: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + case 64: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + default: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + } + } else { + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, pred, + pred_stride); + } +} diff --git a/libvpx/vpx_dsp/mips/variance_mmi.c b/libvpx/vpx_dsp/mips/variance_mmi.c new file mode 100644 index 000000000..4af60d363 --- /dev/null +++ b/libvpx/vpx_dsp/mips/variance_mmi.c @@ -0,0 +1,1280 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/variance.h" +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/asmdefs_mmi.h" + +static const uint8_t bilinear_filters[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +/* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64,vpx_variance64x32, + vpx_variance32x64. VARIANCE_SSE_SUM_8 will lead to sum overflow. 
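   (Editor's note: VARIANCE_SSE_SUM_8 keeps the running pixel sums in
   16-bit lanes; for these block sizes a lane may have to hold totals
   above 65535, so the _FOR_W64 variant widens the differences to 32-bit
   lanes before accumulating.)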
*/ +#define VARIANCE_SSE_SUM_8_FOR_W64 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp6] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ + "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \ + "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \ + "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \ + "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \ + "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" + +#define VARIANCE_SSE_SUM_4 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \ + "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \ + "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + +#define VARIANCE_SSE_SUM_8 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \ + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \ + "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t" + +#define VARIANCE_SSE_8 \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + +#define VARIANCE_SSE_16 \ + VARIANCE_SSE_8 \ + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw 
%[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \ + /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \ + /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \ + \ + /* store: temp2[0] ~ temp2[3] */ \ + "and %[ftmp2], %[ftmp2], %[mask] \n\t" \ + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ + "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ + \ + /* store: temp2[0] ~ temp2[3] */ \ + "and %[ftmp4], %[ftmp4], %[mask] \n\t" \ + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ + "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ + /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ + "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ + "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ + /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp8], 
%[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \ + "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \ + "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \ + "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ + "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[4] ~ temp2[7] */ \ + "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \ + "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \ + \ + /* store: temp2[0] ~ temp2[7] */ \ + "and %[ftmp2], %[ftmp2], %[mask] \n\t" \ + "and %[ftmp3], %[ftmp3], %[mask] \n\t" \ + "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ + "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \ + "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[4] ~ temp2[7] */ \ + "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \ + "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \ + \ + /* store: temp2[0] ~ temp2[7] */ \ + "and %[ftmp8], %[ftmp8], %[mask] \n\t" \ + "and %[ftmp9], %[ftmp9], %[mask] \n\t" \ + "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \ + /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ + \ + /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \ + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ + "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \ + "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], 
%[ftmp14] \n\t" \ + "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \ + /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ + \ + /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \ + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ + "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \ + "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \ + "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \ + "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ + "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \ + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ + \ + /* calculate: temp2[8] ~ temp2[11] */ \ + "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[12] ~ temp2[15] */ \ + "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \ + \ + /* store: temp2[8] ~ temp2[15] */ \ + "and %[ftmp4], %[ftmp4], %[mask] \n\t" \ + "and %[ftmp5], %[ftmp5], %[mask] \n\t" \ + "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \ + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ + \ + /* calculate: temp2[8] ~ temp2[11] */ \ + "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \ + "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[12] ~ temp2[15] */ \ + "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \ + "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \ + \ + /* store: temp2[8] ~ temp2[15] */ \ + "and %[ftmp10], %[ftmp10], %[mask] \n\t" \ + "and %[ftmp11], %[ftmp11], %[mask] \n\t" \ + "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ + "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t" + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the first-pass of 2-D separable filter. +// +// Produces int16_t output to retain precision for the next pass. Two filter +// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is +// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). 
+// It defines the offset required to move from one input to the next. +static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + + ++a; + } + + a += src_pixels_per_line - output_width; + b += output_width; + } +} + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the second-pass of 2-D separable filter. +// +// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two +// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the +// filter is applied horizontally (pixel_step = 1) or vertically +// (pixel_step = stride). It defines the offset required to move from one input +// to the next. Output is 8-bit. +static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + ++a; + } + + a += src_pixels_per_line - output_width; + b += output_width; + } +} + +static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x27(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x20(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x27(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x20(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x2f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x28(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x2f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x28(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x37(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x30(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x37(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x30(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x3f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x38(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x3f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x38(%[b]) \n\t" + 
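// Editor's note: the invocation below completes the eighth and last 8-byte
// chunk of a 64-pixel row; ftmp10 gathers the squared differences (SSE)
// and ftmp9 the signed differences (sum). The function finally returns
//   *sse - (int64_t)sum * sum / (64 * high),
// i.e. the E[d^2] - E[d]^2 identity scaled by the pixel count.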
VARIANCE_SSE_SUM_8_FOR_W64 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "mfc1 %[tmp1], %[ftmp9] \n\t" + "mfhc1 %[tmp2], %[ftmp9] \n\t" + "addu %[sum], %[tmp1], %[tmp2] \n\t" + "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "swc1 %[ftmp1], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [a]"+&r"(a), [b]"+&r"(b), + [sum]"=&r"(sum) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (64 * high)); +} + +#define VPX_VARIANCE64XN(n) \ + uint32_t vpx_variance64x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance64x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE64XN(64) +VPX_VARIANCE64XN(32) + +uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, uint32_t *sse) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + "li %[tmp0], 0x40 \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "mfc1 %[tmp1], %[ftmp9] \n\t" + "mfhc1 %[tmp2], %[ftmp9] \n\t" + "addu %[sum], %[tmp1], %[tmp2] \n\t" + "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "swc1 %[ftmp1], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [a]"+&r"(a), [b]"+&r"(b), + [sum]"=&r"(sum) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [sse]"r"(sse) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / 2048); +} + +static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse 
= 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (32 * high)); +} + +#define VPX_VARIANCE32XN(n) \ + uint32_t vpx_variance32x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance32x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE32XN(32) +VPX_VARIANCE32XN(16) + +static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + 
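/* Reviewer note: %[ftmp8] appears to accumulate the SSE as two 32-bit lanes,
   folded into one word by the dsrl-by-32/paddw pair below and stored to *sse
   via swc1; %[ftmp10] and %[ftmp12] presumably hold the packed positive and
   negative difference totals, which the punpck widening plus the paddw/psubw
   sequence combines into the signed sum. */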
"dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (16 * high)); +} + +#define VPX_VARIANCE16XN(n) \ + uint32_t vpx_variance16x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance16x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE16XN(32) +VPX_VARIANCE16XN(16) +VPX_VARIANCE16XN(8) + +static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (8 * high)); +} + +#define VPX_VARIANCE8XN(n) \ + uint32_t vpx_variance8x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t 
*sse) { \ + return vpx_variance8x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE8XN(16) +VPX_VARIANCE8XN(8) +VPX_VARIANCE8XN(4) + +static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_4 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp10] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), + [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (4 * high)); +} + +#define VPX_VARIANCE4XN(n) \ + uint32_t vpx_variance4x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance4x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE4XN(8) +VPX_VARIANCE4XN(4) + +static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, uint32_t *sse, + uint64_t high) { + double ftmp[12]; + uint32_t tmp[1]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + + "1: \n\t" + VARIANCE_SSE_16 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + + return *sse; +} + +#define vpx_mse16xN(n) \ + uint32_t vpx_mse16x##n##_mmi(const uint8_t *a, 
int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_mse16x(a, a_stride, b, b_stride, sse, n); \ + } + +vpx_mse16xN(16); +vpx_mse16xN(8); + +static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, uint32_t *sse, + uint64_t high) { + double ftmp[12]; + uint32_t tmp[1]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + + "1: \n\t" + VARIANCE_SSE_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + + return *sse; +} + +#define vpx_mse8xN(n) \ + uint32_t vpx_mse8x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + return vpx_mse8x(a, a_stride, b, b_stride, sse, n); \ + } + +vpx_mse8xN(16); +vpx_mse8xN(8); + +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ + \ + return vpx_variance##W##x##H##_mmi(temp2, W, b, b_stride, sse); \ + } + +SUBPIX_VAR(64, 64) +SUBPIX_VAR(64, 32) +SUBPIX_VAR(32, 64) +SUBPIX_VAR(32, 32) +SUBPIX_VAR(32, 16) +SUBPIX_VAR(16, 32) + +static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + uint8_t *temp2, int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[15]; + mips_reg tmp[2]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + + const uint8_t *filter_x = bilinear_filters[xoffset]; + const uint8_t *filter_y = bilinear_filters[yoffset]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp14]) + "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" + "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" + "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" + "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + + // fdata3: fdata3[0] ~ fdata3[15] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + + // fdata3 +a_stride*1: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B + // temp2: temp2[0] ~ temp2[15] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A + + // fdata3 +a_stride*2: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + // temp2+16*1: temp2[0] ~ temp2[15] + MMI_ADDIU(%[temp2_ptr], 
%[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B + + "1: \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A + + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), + [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), + [counter]"+&r"(l_counter) + : [filter_x0] "f"((uint64_t)filter_x[0]), + [filter_x1] "f"((uint64_t)filter_x[1]), + [filter_y0] "f"((uint64_t)filter_y[0]), + [filter_y1] "f"((uint64_t)filter_y[1]), + [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [mask] "f"(mask) + : "memory" + ); +} + +#define SUBPIX_VAR16XN(H) \ + uint32_t vpx_sub_pixel_variance16x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint8_t temp2[16 * H]; \ + var_filter_block2d_bil_16x(a, a_stride, xoffset, yoffset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance16x##H##_mmi(temp2, 16, b, b_stride, sse); \ + } + +SUBPIX_VAR16XN(16) +SUBPIX_VAR16XN(8) + +static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + uint8_t *temp2, int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[15]; + mips_reg tmp[2]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + const uint8_t *filter_x = bilinear_filters[xoffset]; + const uint8_t *filter_y = bilinear_filters[yoffset]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp14]) + "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" + "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" + "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" + "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + + // fdata3: fdata3[0] ~ fdata3[7] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + + // fdata3 +a_stride*1: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B + // temp2: temp2[0] ~ temp2[7] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A + + // fdata3 +a_stride*2: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + // temp2+8*1: temp2[0] ~ temp2[7] + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B + + "1: \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A + + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] 
"=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), + [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), + [counter]"+&r"(l_counter) + : [filter_x0] "f"((uint64_t)filter_x[0]), + [filter_x1] "f"((uint64_t)filter_x[1]), + [filter_y0] "f"((uint64_t)filter_y[0]), + [filter_y1] "f"((uint64_t)filter_y[1]), + [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [mask] "f"(mask) + : "memory" + ); +} + +#define SUBPIX_VAR8XN(H) \ + uint32_t vpx_sub_pixel_variance8x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint8_t temp2[8 * H]; \ + var_filter_block2d_bil_8x(a, a_stride, xoffset, yoffset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance8x##H##_mmi(temp2, 8, b, b_stride, sse); \ + } + +SUBPIX_VAR8XN(16) +SUBPIX_VAR8XN(8) +SUBPIX_VAR8XN(4) + +static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + uint8_t *temp2, int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[7]; + mips_reg tmp[2]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + const uint8_t *filter_x = bilinear_filters[xoffset]; + const uint8_t *filter_y = bilinear_filters[yoffset]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp6]) + "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" + "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" + "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" + "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + // fdata3: fdata3[0] ~ fdata3[3] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + + // fdata3 +a_stride*1: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B + // temp2: temp2[0] ~ temp2[7] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A + + // fdata3 +a_stride*2: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + // temp2+4*1: temp2[0] ~ temp2[7] + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B + + "1: \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A + + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), + [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) + : [filter_x0] "f"((uint64_t)filter_x[0]), + [filter_x1] "f"((uint64_t)filter_x[1]), + [filter_y0] "f"((uint64_t)filter_y[0]), + [filter_y1] "f"((uint64_t)filter_y[1]), + [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [mask] "f"(mask) + : "memory" + ); +} + +#define SUBPIX_VAR4XN(H) \ + uint32_t vpx_sub_pixel_variance4x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int 
b_stride, uint32_t *sse) { \ + uint8_t temp2[4 * H]; \ + var_filter_block2d_bil_4x(a, a_stride, xoffset, yoffset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance4x##H##_mmi(temp2, 4, b, b_stride, sse); \ + } + +SUBPIX_VAR4XN(8) +SUBPIX_VAR4XN(4) + +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ + \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_mmi(temp3, W, b, b_stride, sse); \ + } + +SUBPIX_AVG_VAR(64, 64) +SUBPIX_AVG_VAR(64, 32) +SUBPIX_AVG_VAR(32, 64) +SUBPIX_AVG_VAR(32, 32) +SUBPIX_AVG_VAR(32, 16) +SUBPIX_AVG_VAR(16, 32) +SUBPIX_AVG_VAR(16, 16) +SUBPIX_AVG_VAR(16, 8) +SUBPIX_AVG_VAR(8, 16) +SUBPIX_AVG_VAR(8, 8) +SUBPIX_AVG_VAR(8, 4) +SUBPIX_AVG_VAR(4, 8) +SUBPIX_AVG_VAR(4, 4) diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c index ad2af2866..187a01342 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c @@ -16,8 +16,9 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 dst0, dst1, dst2, dst3, res2, res3; + v16u8 dst0 = { 0 }, res; v16u8 mask0, mask1, mask2, mask3; v8i16 filt, res0, res1; @@ -36,23 +37,23 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, res0, res1); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); SRARI_H2_SH(res0, res1, FILTER_BITS); SAT_SH2_SH(res0, res1, 7); - PCKEV_B2_UB(res0, res0, res1, res1, res2, res3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - XORI_B2_128_UB(res2, res3); - AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + res = PCKEV_XORI128_UB(res0, res1); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); } static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8i16 filt, vec0, vec1, vec2, vec3; mask0 = LD_UB(&mc_filt_mask_arr[16]); @@ -69,7 +70,10 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, LD_SB4(src, src_stride, src0, src1, src2, src3); XORI_B4_128_SB(src0, src1, src2, src3); src += (4 * src_stride); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, 
tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, vec0, vec1); LD_SB4(src, src_stride, src0, src1, src2, src3); @@ -82,10 +86,7 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, res3); ILVR_D2_UB(res1, res0, res3, res2, res0, res2); XORI_B2_128_UB(res0, res2); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4); - AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2); + AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2); ST4x8_UB(res0, res2, dst, dst_stride); } @@ -105,8 +106,9 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { int32_t loop_cnt; + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + v16u8 mask0, mask1, mask2, mask3, dst0 = { 0 }, dst1 = { 0 }; v8i16 filt, out0, out1, out2, out3; mask0 = LD_UB(&mc_filt_mask_arr[0]); @@ -127,10 +129,12 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); } @@ -309,8 +313,9 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; + v16u8 filt0, dst0 = { 0 }, vec0, vec1, res; v8u16 vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -320,23 +325,24 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); SRARI_H2_UH(vec2, vec3, FILTER_BITS); - PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + res = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); } static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8u16 vec4, vec5, vec6, vec7, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -346,7 +352,10 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, filt0 = 
(v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, @@ -354,13 +363,9 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, res3); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + ILVR_D2_UB(res1, res0, res3, res2, res0, res2); + AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2); + ST4x8_UB(res0, res2, dst, dst_stride); } static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, @@ -378,8 +383,9 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 filt0, dst0 = { 0 }, dst1 = { 0 }; v8u16 vec0, vec1, vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[0]); @@ -394,16 +400,18 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); } static void common_hz_2t_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 filt0, dst0 = { 0 }, dst1 = { 0 }; v8u16 vec0, vec1, vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[0]); @@ -419,11 +427,12 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); LD_SB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -431,9 +440,10 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, 
dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); if (16 == height) { @@ -445,10 +455,11 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); LD_SB4(src, src_stride, src0, src1, src2, src3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -456,9 +467,10 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); } } @@ -633,9 +645,10 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -668,8 +681,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, &filt_hor[3], h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -695,8 +708,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filt_hor, h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c index 1cfa63201..5187cea21 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c @@ -16,8 +16,9 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; + v16u8 dst0 = { 0 }, mask0, mask1, mask2, mask3, res; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; 
v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; @@ -59,7 +60,8 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( XORI_B4_128_SB(src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); @@ -73,14 +75,12 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); SRARI_H2_SH(res0, res1, FILTER_BITS); SAT_SH2_SH(res0, res1, 7); - PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1); - XORI_B2_128_UB(tmp0, tmp1); - AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1); - ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); + res = PCKEV_XORI128_UB(res0, res1); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); hz_out5 = hz_out9; @@ -94,10 +94,11 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; - v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3; + v16u8 dst0 = { 0 }, dst1 = { 0 }, mask0, mask1, mask2, mask3; v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; @@ -144,7 +145,9 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( XORI_B4_128_SB(src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); @@ -172,7 +175,7 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); - CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); @@ -225,9 +228,10 @@ static void common_hv_8ht_8vt_and_aver_dst_64w_msa( static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 dst0, dst1, dst2, dst3, res0, res1; + v16u8 dst0 = { 0 }, out; v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -248,21 +252,22 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - LD_UB4(dst, dst_stride, dst0, dst1, 
dst2, dst3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); - AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, dst0); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; - v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -289,21 +294,18 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( hz_out3, hz_out5, 8); hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2, - res3); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST4x8_UB(res0, res1, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_4w_msa( @@ -321,8 +323,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4w_msa( static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 filt_hz, filt_vt, dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3; v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -338,7 +341,9 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( LD_SB5(src, src_stride, src0, src1, src2, src3, src4); src += (5 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); @@ -357,16 +362,16 @@ static void 
common_hv_2ht_2vt_and_aver_dst_8x4_msa( tmp3 = __msa_dotp_u_h(vec3, filt_vt); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3; + v16u8 filt_hz, filt_vt, vec0, dst0 = { 0 }, dst1 = { 0 }; v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -407,9 +412,10 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( tmp3 = __msa_dotp_u_h(vec0, filt_vt); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -516,9 +522,10 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa( void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -560,14 +567,14 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], &filt_ver[3], h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { switch (w) { case 4: @@ -596,8 +603,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver, h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c index 146ce3b2f..ef8c90114 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c @@ -17,8 +17,9 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3, out; + v16u8 dst0 = { 0 }, out; v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, 
src8776; v16i8 src10998, filt0, filt1, filt2, filt3; @@ -43,7 +44,8 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, LD_SB4(src, src_stride, src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); @@ -55,9 +57,6 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, SRARI_H2_SH(out10, out32, FILTER_BITS); SAT_SH2_SH(out10, out32, 7); out = PCKEV_XORI128_UB(out10, out32); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - - dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0); out = __msa_aver_u_b(out, dst0); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -75,8 +74,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; v8i16 filt, out0, out1, out2, out3; @@ -98,7 +98,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, LD_SB4(src, src_stride, src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); XORI_B4_128_SB(src7, src8, src9, src10); ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); @@ -112,7 +114,7 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, filt1, filt2, filt3); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); @@ -246,8 +248,9 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4; - v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + v16u8 dst0 = { 0 }, out, filt0, src2110, src4332; v16i8 src10_r, src32_r, src21_r, src43_r; v8i16 filt; v8u16 tmp0, tmp1; @@ -261,9 +264,8 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, src4 = LD_SB(src); src += src_stride; - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); - dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, src32_r, src43_r); ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); @@ -280,7 +282,8 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + uint32_t tp0, tp1, tp2, tp3; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; v16i8 src10_r, src32_r, src54_r, 
src76_r, src21_r, src43_r, src65_r; v16u8 src2110, src4332, src6554, src8776, filt0; @@ -294,10 +297,10 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, src += (8 * src_stride); src8 = LD_SB(src); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2, - dst3); - ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, src32_r, src43_r); ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, @@ -309,9 +312,7 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); - ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride); + ST4x8_UB(src2110, src4332, dst, dst_stride); } static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, @@ -329,8 +330,9 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + int64_t tp0, tp1, tp2, tp3; v16u8 src0, src1, src2, src3, src4; - v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + v16u8 dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3, filt0; v8u16 tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -339,22 +341,24 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, filt0 = (v16u8)__msa_splati_h(filt, 0); LD_UB5(src, src_stride, src0, src1, src2, src3, src4); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); } static void common_vt_2t_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + int64_t tp0, tp1, tp2, tp3; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; - v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; v8u16 tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -369,7 +373,12 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa( for (loop_cnt = (height >> 3); loop_cnt--;) { LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); src += (8 * src_stride); - LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst2); + INSERT_D2_UB(tp2, tp3, dst3); ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, vec3); @@ -378,15 
+387,13 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride); dst += (4 * dst_stride); src0 = src8; @@ -605,9 +612,10 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -640,8 +648,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, &filt_ver[3], h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -668,8 +676,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filt_ver, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c index 9e8bf7b51..152dc2610 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c @@ -621,9 +621,10 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -656,8 +657,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -683,8 +684,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor, h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c index 
b16ec5788..d35a5a7a6 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c @@ -541,9 +541,11 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, } void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int32_t x_step_q4, const int16_t *filter_y, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -585,14 +587,14 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, &filt_ver[3], (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); } else { switch (w) { case 4: @@ -621,9 +623,605 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } } + +static void filter_horiz_w4_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + uint32_t res; + v16u8 src0 = { 0 }, src1 = { 0 }, dst0; + v16i8 out0, out1; + v16i8 shf1 = { 0, 8, 16, 24, 4, 12, 20, 28, 1, 9, 17, 25, 5, 13, 21, 29 }; + v16i8 shf2 = shf1 + 2; + v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 }; + v16i8 filt_shf1 = filt_shf0 + 2; + v16i8 filt_shf2 = filt_shf0 + 4; + v16i8 filt_shf3 = filt_shf0 + 6; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, filt0, filt1, filt2, filt3; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1); + XORI_B2_128_SB(out0, out1); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + + filt = LD_SH(x_filter); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3); + + src0_h *= filt0; + src0_h += src1_h * filt1; + src0_h += src2_h * filt2; + src0_h += src3_h * filt3; + + src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8); + + src0_h = __msa_adds_s_h(src0_h, src1_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + res = __msa_copy_u_w((v4i32)dst0, 0); + SW(res, dst); +} + +static void filter_horiz_w8_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v16u8 tmp0, tmp1, tmp2, tmp3, dst0; + v16i8 out0, out1, out2, out3; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 
11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src2); + INSERT_D2_UB(srcd2, srcd3, src3); + + filt = LD_SH(x_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + // transpose + VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out0, out1); + ILVRL_W2_SB(tmp3, tmp1, out2, out3); + + XORI_B4_128_SB(out0, out1, out2, out3); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + UNPCK_SB_SH(out2, src4_h, src5_h); + UNPCK_SB_SH(out3, src6_h, src7_h); + + src0_h *= filt0; + src4_h *= filt4; + src0_h += src1_h * filt1; + src4_h += src5_h * filt5; + src0_h += src2_h * filt2; + src4_h += src6_h * filt6; + src0_h += src3_h * filt3; + src4_h += src7_h * filt7; + + src0_h = __msa_adds_s_h(src0_h, src4_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + ST8x1_UB(dst0, dst); +} + +static void filter_horiz_w16_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v16u8 src4 = { 0 }, src5 = { 0 }, src6 = { 0 }, src7 = { 0 }; + v16u8 tmp0, tmp1, tmp2, tmp3, dst0; + v16i8 out0, out1, out2, out3, out4, out5, out6, out7; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + v8i16 dst0_h, dst1_h, dst2_h, dst3_h; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src2); + INSERT_D2_UB(srcd2, srcd3, src3); + LD4(src_x + 8 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src4); + INSERT_D2_UB(srcd2, srcd3, src5); + LD4(src_x + 12 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src6); + INSERT_D2_UB(srcd2, srcd3, src7); + + filt = LD_SH(x_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + // transpose + VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out0, out1); + ILVRL_W2_SB(tmp3, tmp1, out2, out3); + XORI_B4_128_SB(out0, out1, out2, out3); + + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + UNPCK_SB_SH(out2, src4_h, src5_h); + UNPCK_SB_SH(out3, src6_h, src7_h); + + VSHF_B2_UB(src4, src5, src4, src5, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src6, src7, src6, src7, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out4, out5); + ILVRL_W2_SB(tmp3, tmp1, out6, out7); + XORI_B4_128_SB(out4, out5, out6, out7); + + dst0_h = src0_h * filt0; + dst1_h = src4_h * filt4; + dst0_h += src1_h * filt1; + dst1_h += src5_h * 
filt5; + dst0_h += src2_h * filt2; + dst1_h += src6_h * filt6; + dst0_h += src3_h * filt3; + dst1_h += src7_h * filt7; + + UNPCK_SB_SH(out4, src0_h, src1_h); + UNPCK_SB_SH(out5, src2_h, src3_h); + UNPCK_SB_SH(out6, src4_h, src5_h); + UNPCK_SB_SH(out7, src6_h, src7_h); + + dst2_h = src0_h * filt0; + dst3_h = src4_h * filt4; + dst2_h += src1_h * filt1; + dst3_h += src5_h * filt5; + dst2_h += src2_h * filt2; + dst3_h += src6_h * filt6; + dst2_h += src3_h * filt3; + dst3_h += src7_h * filt7; + + ADDS_SH2_SH(dst0_h, dst1_h, dst2_h, dst3_h, dst0_h, dst2_h); + SRARI_H2_SH(dst0_h, dst2_h, FILTER_BITS); + SAT_SH2_SH(dst0_h, dst2_h, 7); + dst0 = PCKEV_XORI128_UB(dst0_h, dst2_h); + ST_UB(dst0, dst); +} + +static void transpose4x4_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0; + v16i8 out0 = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + + in0 = LD_UB(src); + out0 = __msa_vshf_b(out0, (v16i8)in0, (v16i8)in0); + ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride); +} + +static void transpose8x8_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + + LD_UB4(src, 16, in0, in1, in2, in3); + VSHF_B2_UB(in0, in1, in0, in1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(in2, in3, in2, in3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_UB(tmp2, tmp0, out0, out1); + ILVRL_W2_UB(tmp3, tmp1, out2, out3); + ST8x4_UB(out0, out1, dst, dst_stride); + ST8x4_UB(out2, out3, dst + 4 * dst_stride, dst_stride); +} + +static void transpose16x16_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12; + v16u8 in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7, out8; + v16u8 out9, out10, out11, out12, out13, out14, out15; + + LD_UB8(src, 16, in0, in1, in2, in3, in4, in5, in6, in7); + LD_UB8(src + 16 * 8, 16, in8, in9, in10, in11, in12, in13, in14, in15); + + TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, out0, out1, out2, out3, + out4, out5, out6, out7); + ST_UB8(out0, out1, out2, out3, out4, out5, out6, out7, dst, dst_stride); + dst += 8 * dst_stride; + + SLDI_B4_0_UB(in0, in1, in2, in3, in0, in1, in2, in3, 8); + SLDI_B4_0_UB(in4, in5, in6, in7, in4, in5, in6, in7, 8); + SLDI_B4_0_UB(in8, in9, in10, in11, in8, in9, in10, in11, 8); + SLDI_B4_0_UB(in12, in13, in14, in15, in12, in13, in14, in15, 8); + + TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, out8, out9, out10, out11, + out12, out13, out14, out15); + ST_UB8(out8, out9, out10, out11, out12, out13, out14, out15, dst, dst_stride); +} + +static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int y, z, i; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; y += 4) { + int x_q4 = x0_q4; + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w4_msa(src_x, src_stride, temp + (z * 4), x_filter); + } else { + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + + x_q4 += x_step_q4; + } + + 
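
The horizontal loops above walk the source in Q4 fixed point: the high bits of x_q4 select the source column, the low SUBPEL_BITS bits select one of 16 sub-pixel filter phases, and a zero phase falls back to a plain copy of tap 3 (SUBPEL_TAPS / 2 - 1), since the zero-phase kernel is a unit impulse at that tap. A scalar sketch of the same bookkeeping, using libvpx's public clip_pixel/ROUND_POWER_OF_TWO helpers (the function name is illustrative, not part of the patch):

    static void scaled_horiz_row_scalar(const uint8_t *src, uint8_t *out,
                                        const InterpKernel *kernels, int x0_q4,
                                        int x_step_q4, int w) {
      int x, k, x_q4 = x0_q4;
      for (x = 0; x < w; ++x) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];  /* column */
        const int16_t *const f = kernels[x_q4 & SUBPEL_MASK];    /* phase */
        int sum = 0;
        for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * f[k];
        out[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
        x_q4 += x_step_q4;
      }
    }
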
transpose4x4_to_dst(temp, dst, dst_stride); + + src += src_stride * 4; + dst += dst_stride * 4; + } +} + +static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int y, z, i; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. + y = h + (8 - (h & 0x7)); + + do { + int x_q4 = x0_q4; + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w8_msa(src_x, src_stride, temp + (z * 8), x_filter); + } else { + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[3 + i * src_stride]; + } + } + + x_q4 += x_step_q4; + } + + transpose8x8_to_dst(temp, dst, dst_stride); + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static void scaledconvolve_horiz_mul16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[16 * 16]); + int x, y, z, i; + + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 16x16 areas. The intermediate height is not always + // a multiple of 16, so force it to be a multiple of 16 here. + y = h + (16 - (h & 0xF)); + + do { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 16) { + for (z = 0; z < 16; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w16_msa(src_x, src_stride, temp + (z * 16), x_filter); + } else { + for (i = 0; i < 16; ++i) { + temp[z * 16 + i] = src_x[3 + i * src_stride]; + } + } + + x_q4 += x_step_q4; + } + + transpose16x16_to_dst(temp, dst + x, dst_stride); + } + + src += src_stride * 16; + dst += dst_stride * 16; + } while (y -= 16); +} + +static void filter_vert_w4_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter) { + uint32_t srcw0, srcw1, srcw2, srcw3, srcw4, srcw5, srcw6, srcw7; + uint32_t res; + v16u8 src0 = { 0 }, src1 = { 0 }, dst0; + v16i8 out0, out1; + v16i8 shf1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 }; + v16i8 shf2 = shf1 + 8; + v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 }; + v16i8 filt_shf1 = filt_shf0 + 2; + v16i8 filt_shf2 = filt_shf0 + 4; + v16i8 filt_shf3 = filt_shf0 + 6; + v8i16 filt, src0_h, src1_h, src2_h, src3_h; + v8i16 filt0, filt1, filt2, filt3; + + LW4(src_y, src_pitch, srcw0, srcw1, srcw2, srcw3); + LW4(src_y + 4 * src_pitch, src_pitch, srcw4, srcw5, srcw6, srcw7); + INSERT_W4_UB(srcw0, srcw1, srcw2, srcw3, src0); + INSERT_W4_UB(srcw4, srcw5, srcw6, srcw7, src1); + VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1); + XORI_B2_128_SB(out0, out1); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + + filt = LD_SH(y_filter); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3); + + src0_h *= filt0; + src0_h += src1_h * filt1; + src0_h += src2_h * filt2; + src0_h += src3_h * filt3; + + src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8); + + src0_h = __msa_adds_s_h(src0_h, src1_h); + src0_h = __msa_srari_h(src0_h, 
FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + res = __msa_copy_u_w((v4i32)dst0, 0); + SW(res, dst); +} + +static void filter_vert_w8_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 dst0; + v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + LD4(src_y, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_SB(srcd0, srcd1, src0); + INSERT_D2_SB(srcd2, srcd3, src1); + LD4(src_y + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_SB(srcd0, srcd1, src2); + INSERT_D2_SB(srcd2, srcd3, src3); + + filt = LD_SH(y_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + XORI_B4_128_SB(src0, src1, src2, src3); + UNPCK_SB_SH(src0, src0_h, src1_h); + UNPCK_SB_SH(src1, src2_h, src3_h); + UNPCK_SB_SH(src2, src4_h, src5_h); + UNPCK_SB_SH(src3, src6_h, src7_h); + + src0_h *= filt0; + src4_h *= filt4; + src0_h += src1_h * filt1; + src4_h += src5_h * filt5; + src0_h += src2_h * filt2; + src4_h += src6_h * filt6; + src0_h += src3_h * filt3; + src4_h += src7_h * filt7; + + src0_h = __msa_adds_s_h(src0_h, src4_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + ST8x1_UB(dst0, dst); +} + +static void filter_vert_mul_w16_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter, + int w) { + int x; + v16u8 dst0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 src8_h, src9_h, src10_h, src11_h, src12_h, src13_h, src14_h, src15_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + filt = LD_SH(y_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + for (x = 0; x < w; x += 16) { + LD_SB8(src_y, src_pitch, src0, src1, src2, src3, src4, src5, src6, src7); + src_y += 16; + + XORI_B4_128_SB(src0, src1, src2, src3); + XORI_B4_128_SB(src4, src5, src6, src7); + UNPCK_SB_SH(src0, src0_h, src1_h); + UNPCK_SB_SH(src1, src2_h, src3_h); + UNPCK_SB_SH(src2, src4_h, src5_h); + UNPCK_SB_SH(src3, src6_h, src7_h); + UNPCK_SB_SH(src4, src8_h, src9_h); + UNPCK_SB_SH(src5, src10_h, src11_h); + UNPCK_SB_SH(src6, src12_h, src13_h); + UNPCK_SB_SH(src7, src14_h, src15_h); + + src0_h *= filt0; + src1_h *= filt0; + src8_h *= filt4; + src9_h *= filt4; + src0_h += src2_h * filt1; + src1_h += src3_h * filt1; + src8_h += src10_h * filt5; + src9_h += src11_h * filt5; + src0_h += src4_h * filt2; + src1_h += src5_h * filt2; + src8_h += src12_h * filt6; + src9_h += src13_h * filt6; + src0_h += src6_h * filt3; + src1_h += src7_h * filt3; + src8_h += src14_h * filt7; + src9_h += src15_h * filt7; + + ADDS_SH2_SH(src0_h, src8_h, src1_h, src9_h, src0_h, src1_h); + SRARI_H2_SH(src0_h, src1_h, FILTER_BITS); + SAT_SH2_SH(src0_h, src1_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src1_h); + ST_UB(dst0, dst); + dst += 16; + } +} + +static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 
1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w4_msa(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + uint32_t srcd = LW(src_y + 3 * src_stride); + SW(srcd, dst + y * dst_stride); + } + + y_q4 += y_step_q4; + } +} + +static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w8_msa(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + uint64_t srcd = LD(src_y + 3 * src_stride); + SD(srcd, dst + y * dst_stride); + } + + y_q4 += y_step_q4; + } +} + +static void scaledconvolve_vert_mul16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + int y_q4 = y0_q4; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_mul_w16_msa(src_y, src_stride, &dst[y * dst_stride], y_filter, + w); + } else { + for (x = 0; x < w; ++x) { + dst[x + y * dst_stride] = src_y[x + 3 * src_stride]; + } + } + + y_q4 += y_step_q4; + } +} + +void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. 
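
Making the bound above concrete (a sketch, not part of the patch; the helper name is illustrative): 64 output rows at the largest normative step cover (64 - 1) * 32 = 2016 sixteenth-pel units, i.e. 126 whole rows plus a fraction, rounded up to 127 for a sub-pixel y0_q4 start; adding SUBPEL_TAPS = 8 rows of filter tail gives 135. The intermediate_height expression used just below stays inside that bound:

    static int worst_case_intermediate_height(void) {
      const int h = 64, y_step_q4 = 32, y0_q4 = SUBPEL_MASK;  /* worst case */
      /* ((63 * 32 + 15) >> 4) + 8 = 126 + 8 = 134, within the 135-row bound */
      return (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
    }
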
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if ((0 == x0_q4) && (16 == x_step_q4) && (0 == y0_q4) && (16 == y_step_q4)) { + vpx_convolve_copy_msa(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + } else { + if (w >= 16) { + scaledconvolve_horiz_mul16(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + w, intermediate_height); + } else if (w == 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_mul16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, h); + } + } +} diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c index 410682271..13fce0077 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c @@ -628,9 +628,10 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -663,8 +664,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_ver[3], h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -690,8 +691,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c index 45399bad8..ce649935d 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, @@ -188,13 +189,14 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 4: { diff --git a/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c index c3d87a4ab..c2ab33a2f 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c @@ -9,6 +9,7 @@ */ #include <string.h> +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" static void copy_width8_msa(const uint8_t *src, int32_t src_stride, @@ -198,13 +199,14 @@ static void copy_width64_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 4: { diff --git a/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/libvpx/vpx_dsp/mips/vpx_convolve_msa.h index f75679521..d53244596 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve_msa.h +++ b/libvpx/vpx_dsp/mips/vpx_convolve_msa.h @@ -110,14 +110,13 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; ST_UB(tmp_m, (pdst)); \ } -#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \ - stride) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ - PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ +#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } #endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */ diff --git a/libvpx/vpx_dsp/ppc/hadamard_vsx.c b/libvpx/vpx_dsp/ppc/hadamard_vsx.c index 435e3eb5b..e279b3047 100644 --- a/libvpx/vpx_dsp/ppc/hadamard_vsx.c +++ b/libvpx/vpx_dsp/ppc/hadamard_vsx.c @@ -42,7 +42,7 @@ static void vpx_hadamard_s16_8x8_one_pass(int16x8_t v[8]) { v[7] = vec_add(c1, c5); } -void vpx_hadamard_8x8_vsx(const int16_t *src_diff, int src_stride, +void vpx_hadamard_8x8_vsx(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int16x8_t v[8]; @@ -71,7 +71,7 @@ void vpx_hadamard_8x8_vsx(const int16_t *src_diff, int src_stride, store_tran_low(v[7], 0, coeff + 56); } -void vpx_hadamard_16x16_vsx(const int16_t *src_diff, int src_stride, +void vpx_hadamard_16x16_vsx(const int16_t *src_diff, ptrdiff_t 
src_stride, tran_low_t *coeff) { int i; const uint16x8_t ones = vec_splat_u16(1); diff --git a/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c b/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c new file mode 100644 index 000000000..d43a9fd18 --- /dev/null +++ b/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c @@ -0,0 +1,1063 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <math.h> +#include <stdlib.h> +#include <string.h> + +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" +#include "vpx_dsp/ppc/types_vsx.h" + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/inv_txfm.h" + +static int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, + 16364, 16364, 16364, 16364 }; +static int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, + 16305, 16305, 16305, 16305 }; +static int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, + 16207, 16207, 16207, 16207 }; +static int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, + 16069, 16069, 16069, 16069 }; +static int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, + -16069, -16069, -16069, -16069 }; +static int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, + 15893, 15893, 15893, 15893 }; +static int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, + 15679, 15679, 15679, 15679 }; +static int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, + 15426, 15426, 15426, 15426 }; +static int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, + 15137, 15137, 15137, 15137 }; +static int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, + -15137, -15137, -15137, -15137 }; +static int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, + 14811, 14811, 14811, 14811 }; +static int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, + 14449, 14449, 14449, 14449 }; +static int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, + 14053, 14053, 14053, 14053 }; +static int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, + 13623, 13623, 13623, 13623 }; +static int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, + 13160, 13160, 13160, 13160 }; +static int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, + 12665, 12665, 12665, 12665 }; +static int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, + 12140, 12140, 12140, 12140 }; +static int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, + 11585, 11585, 11585, 11585 }; +static int16x8_t cospi17_v = { 11003, 11003, 11003, 11003, + 11003, 11003, 11003, 11003 }; +static int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, + 10394, 10394, 10394, 10394 }; +static int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, 9760, 9760, 9760, 9760 }; +static int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, 9102, 9102, 9102, 9102 }; +static int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, + -9102, -9102, -9102, -9102 }; +static int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, 8423, 8423, 8423, 8423 }; +static int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, 7723, 7723, 7723, 7723 }; +static int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, 7005, 7005, 7005, 7005 }; +static int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, 6270, 6270, 6270, 6270 }; +static int16x8_t cospi24_mv = { -6270, -6270, -6270, -6270, + -6270, -6270, -6270, -6270 }; +static int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, 5520, 5520, 5520, 5520 }; +static int16x8_t cospi26_v = { 
4756, 4756, 4756, 4756, 4756, 4756, 4756, 4756 }; +static int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, 3981, 3981, 3981, 3981 }; +static int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 }; +static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 }; +static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 }; +static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; + +#define ROUND_SHIFT_INIT \ + const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \ + const uint32x4_t shift14 = vec_splat_u32(14); + +#define DCT_CONST_ROUND_SHIFT(vec) vec = vec_sra(vec_add(vec, shift), shift14); + +#define PIXEL_ADD_INIT \ + int16x8_t add8 = vec_splat_s16(8); \ + uint16x8_t shift4 = vec_splat_u16(4); + +#define PIXEL_ADD4(out, in) out = vec_sra(vec_add(in, add8), shift4); + +#define IDCT4(in0, in1, out0, out1) \ + t0 = vec_add(in0, in1); \ + t1 = vec_sub(in0, in1); \ + tmp16_0 = vec_mergeh(t0, t1); \ + temp1 = vec_sra(vec_add(vec_mule(tmp16_0, cospi16_v), shift), shift14); \ + temp2 = vec_sra(vec_add(vec_mulo(tmp16_0, cospi16_v), shift), shift14); \ + \ + tmp16_0 = vec_mergel(in0, in1); \ + temp3 = vec_sub(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp3); \ + temp4 = vec_add(vec_mule(tmp16_0, cospi8_v), vec_mulo(tmp16_0, cospi24_v)); \ + DCT_CONST_ROUND_SHIFT(temp4); \ + \ + step0 = vec_packs(temp1, temp2); \ + step1 = vec_packs(temp4, temp3); \ + out0 = vec_add(step0, step1); \ + out1 = vec_sub(step0, step1); \ + out1 = vec_perm(out1, out1, mask0); + +void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int32x4_t temp1, temp2, temp3, temp4; + int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1; + uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; + uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; + int16x8_t v0 = load_tran_low(0, input); + int16x8_t v1 = load_tran_low(8 * sizeof(*input), input); + int16x8_t t0 = vec_mergeh(v0, v1); + int16x8_t t1 = vec_mergel(v0, v1); + + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + uint8x16_t zerov = vec_splat_u8(0); + int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); + int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); + int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); + int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + uint8x16_t output_v; + uint8_t tmp_dest[16]; + ROUND_SHIFT_INIT + PIXEL_ADD_INIT; + + v0 = vec_mergeh(t0, t1); + v1 = vec_mergel(t0, t1); + + IDCT4(v0, v1, t_out0, t_out1); + // transpose + t0 = vec_mergeh(t_out0, t_out1); + t1 = vec_mergel(t_out0, t_out1); + v0 = vec_mergeh(t0, t1); + v1 = vec_mergel(t0, t1); + IDCT4(v0, v1, t_out0, t_out1); + + PIXEL_ADD4(v0, t_out0); + PIXEL_ADD4(v1, t_out1); + tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); + tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1); + output_v = vec_packsu(tmp16_0, tmp16_1); + + vec_vsx_st(output_v, 0, tmp_dest); + for (int i = 0; i < 4; i++) + for (int j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; +} + +#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + out0 = vec_mergeh(in0, in1); \ + out1 = vec_mergel(in0, in1); \ + out2 = vec_mergeh(in2, in3); \ 
+ out3 = vec_mergel(in2, in3); \ + out4 = vec_mergeh(in4, in5); \ + out5 = vec_mergel(in4, in5); \ + out6 = vec_mergeh(in6, in7); \ + out7 = vec_mergel(in6, in7); \ + in0 = (int16x8_t)vec_mergeh((int32x4_t)out0, (int32x4_t)out2); \ + in1 = (int16x8_t)vec_mergel((int32x4_t)out0, (int32x4_t)out2); \ + in2 = (int16x8_t)vec_mergeh((int32x4_t)out1, (int32x4_t)out3); \ + in3 = (int16x8_t)vec_mergel((int32x4_t)out1, (int32x4_t)out3); \ + in4 = (int16x8_t)vec_mergeh((int32x4_t)out4, (int32x4_t)out6); \ + in5 = (int16x8_t)vec_mergel((int32x4_t)out4, (int32x4_t)out6); \ + in6 = (int16x8_t)vec_mergeh((int32x4_t)out5, (int32x4_t)out7); \ + in7 = (int16x8_t)vec_mergel((int32x4_t)out5, (int32x4_t)out7); \ + out0 = vec_perm(in0, in4, tr8_mask0); \ + out1 = vec_perm(in0, in4, tr8_mask1); \ + out2 = vec_perm(in1, in5, tr8_mask0); \ + out3 = vec_perm(in1, in5, tr8_mask1); \ + out4 = vec_perm(in2, in6, tr8_mask0); \ + out5 = vec_perm(in2, in6, tr8_mask1); \ + out6 = vec_perm(in3, in7, tr8_mask0); \ + out7 = vec_perm(in3, in7, tr8_mask1); + +/* for the: temp1 = step[x] * cospi_q - step[y] * cospi_z + * temp2 = step[x] * cospi_z + step[y] * cospi_q */ +#define STEP8_0(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mule(tmp16_0, cospi0), vec_mulo(tmp16_0, cospi1)); \ + temp11 = vec_sub(vec_mule(tmp16_1, cospi0), vec_mulo(tmp16_1, cospi1)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define STEP8_1(inpt0, inpt1, outpt0, outpt1, cospi) \ + tmp16_2 = vec_sub(inpt0, inpt1); \ + tmp16_3 = vec_add(inpt0, inpt1); \ + tmp16_0 = vec_mergeh(tmp16_2, tmp16_3); \ + tmp16_1 = vec_mergel(tmp16_2, tmp16_3); \ + temp10 = vec_mule(tmp16_0, cospi); \ + temp11 = vec_mule(tmp16_1, cospi); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_mulo(tmp16_0, cospi); \ + temp11 = vec_mulo(tmp16_1, cospi); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7) \ + /* stage 1 */ \ + step0 = in0; \ + step2 = in4; \ + step1 = in2; \ + step3 = in6; \ + \ + STEP8_0(in1, in7, step4, step7, cospi28_v, cospi4_v); \ + STEP8_0(in5, in3, step5, step6, cospi12_v, cospi20_v); \ + \ + /* stage 2 */ \ + STEP8_1(step0, step2, in1, in0, cospi16_v); \ + STEP8_0(step1, step3, in2, in3, cospi24_v, cospi8_v); \ + in4 = vec_add(step4, step5); \ + in5 = vec_sub(step4, step5); \ + in6 = vec_sub(step7, step6); \ + in7 = vec_add(step6, step7); \ + \ + /* stage 3 */ \ + step0 = vec_add(in0, in3); \ + step1 = vec_add(in1, in2); \ + step2 = vec_sub(in1, in2); \ + step3 = vec_sub(in0, in3); \ + step4 = in4; \ + STEP8_1(in6, in5, step5, step6, cospi16_v); \ + step7 = in7; \ + \ + /* stage 4 */ \ + in0 = vec_add(step0, step7); \ + in1 = vec_add(step1, step6); \ + in2 = vec_add(step2, step5); \ + in3 = vec_add(step3, step4); \ + in4 = vec_sub(step3, step4); \ + in5 = vec_sub(step2, step5); \ + in6 = vec_sub(step1, step6); \ + in7 = vec_sub(step0, step7); + +#define PIXEL_ADD(in, out, add, shiftx) \ + out = vec_add(vec_sra(vec_add(in, add), shiftx), out); + 
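
The rounding idiom above has a simple scalar model (a sketch; assumes DCT_CONST_BITS == 14 as in vpx_dsp/txfm_common.h, and the helper names are illustrative): each vec_mule/vec_mulo product of pixels against the cospi constants carries 14 fraction bits, DCT_CONST_ROUND_SHIFT rounds them away just as dct_const_round_shift() does in the C reference, and PIXEL_ADD applies the per-pass output rounding before adding into the destination (add = 16 and shiftx = 5 for the 8x8 second pass below):

    static int32_t round_shift_14(int32_t x) {
      return (x + (1 << 13)) >> 14;  /* per-lane DCT_CONST_ROUND_SHIFT */
    }

    static uint8_t pixel_add_scalar(int16_t residual, uint8_t pred) {
      /* PIXEL_ADD for the 8x8 second pass: round the residual by
       * (x + 16) >> 5, then add to the predictor; the vector code
       * clamps to [0, 255] via vec_packsu. */
      int v = ((residual + 16) >> 5) + pred;
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
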
+static uint8x16_t tr8_mask0 = { + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 +}; +static uint8x16_t tr8_mask1 = { + 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F +}; +void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int32x4_t temp10, temp11; + int16x8_t step0, step1, step2, step3, step4, step5, step6, step7; + int16x8_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp16_0, tmp16_1, + tmp16_2, tmp16_3; + int16x8_t src0 = load_tran_low(0, input); + int16x8_t src1 = load_tran_low(8 * sizeof(*input), input); + int16x8_t src2 = load_tran_low(16 * sizeof(*input), input); + int16x8_t src3 = load_tran_low(24 * sizeof(*input), input); + int16x8_t src4 = load_tran_low(32 * sizeof(*input), input); + int16x8_t src5 = load_tran_low(40 * sizeof(*input), input); + int16x8_t src6 = load_tran_low(48 * sizeof(*input), input); + int16x8_t src7 = load_tran_low(56 * sizeof(*input), input); + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + uint8x16_t dest4 = vec_vsx_ld(4 * stride, dest); + uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest); + uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest); + uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest); + uint8x16_t zerov = vec_splat_u8(0); + int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); + int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); + int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); + int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + int16x8_t d_u4 = (int16x8_t)vec_mergeh(dest4, zerov); + int16x8_t d_u5 = (int16x8_t)vec_mergeh(dest5, zerov); + int16x8_t d_u6 = (int16x8_t)vec_mergeh(dest6, zerov); + int16x8_t d_u7 = (int16x8_t)vec_mergeh(dest7, zerov); + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1)); + uint16x8_t shift5 = vec_splat_u16(5); + uint8x16_t output0, output1, output2, output3; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src0, src1, src2, src3, src4, src5, src6, src7, tmp0, tmp1, tmp2, + tmp3, tmp4, tmp5, tmp6, tmp7); + + IDCT8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + TRANSPOSE8x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, src2, + src3, src4, src5, src6, src7); + IDCT8(src0, src1, src2, src3, src4, src5, src6, src7); + PIXEL_ADD(src0, d_u0, add, shift5); + PIXEL_ADD(src1, d_u1, add, shift5); + PIXEL_ADD(src2, d_u2, add, shift5); + PIXEL_ADD(src3, d_u3, add, shift5); + PIXEL_ADD(src4, d_u4, add, shift5); + PIXEL_ADD(src5, d_u5, add, shift5); + PIXEL_ADD(src6, d_u6, add, shift5); + PIXEL_ADD(src7, d_u7, add, shift5); + output0 = vec_packsu(d_u0, d_u1); + output1 = vec_packsu(d_u2, d_u3); + output2 = vec_packsu(d_u4, d_u5); + output3 = vec_packsu(d_u6, d_u7); + + vec_vsx_st(xxpermdi(output0, dest0, 1), 0, dest); + vec_vsx_st(xxpermdi(output0, dest1, 3), stride, dest); + vec_vsx_st(xxpermdi(output1, dest2, 1), 2 * stride, dest); + vec_vsx_st(xxpermdi(output1, dest3, 3), 3 * stride, dest); + vec_vsx_st(xxpermdi(output2, dest4, 1), 4 * stride, dest); + vec_vsx_st(xxpermdi(output2, dest5, 3), 5 * stride, dest); + vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest); + vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest); +} + +#define LOAD_INPUT16(load, source, offset, step, in0, in1, in2, in3, in4, in5, \ + in6, in7, in8, in9, inA, inB, inC, inD, inE, inF) \ + in0 = load(offset, source); \ + in1 = load((step) + (offset), source); \ + 
in2 = load(2 * (step) + (offset), source); \ + in3 = load(3 * (step) + (offset), source); \ + in4 = load(4 * (step) + (offset), source); \ + in5 = load(5 * (step) + (offset), source); \ + in6 = load(6 * (step) + (offset), source); \ + in7 = load(7 * (step) + (offset), source); \ + in8 = load(8 * (step) + (offset), source); \ + in9 = load(9 * (step) + (offset), source); \ + inA = load(10 * (step) + (offset), source); \ + inB = load(11 * (step) + (offset), source); \ + inC = load(12 * (step) + (offset), source); \ + inD = load(13 * (step) + (offset), source); \ + inE = load(14 * (step) + (offset), source); \ + inF = load(15 * (step) + (offset), source); + +#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_mule(tmp16_0, cospi); \ + temp11 = vec_mule(tmp16_1, cospi); \ + temp20 = vec_mulo(tmp16_0, cospi); \ + temp21 = vec_mulo(tmp16_1, cospi); \ + temp30 = vec_sub(temp10, temp20); \ + temp10 = vec_add(temp10, temp20); \ + temp20 = vec_sub(temp11, temp21); \ + temp21 = vec_add(temp11, temp21); \ + DCT_CONST_ROUND_SHIFT(temp30); \ + DCT_CONST_ROUND_SHIFT(temp20); \ + outpt0 = vec_packs(temp30, temp20); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp21); \ + outpt1 = vec_packs(temp10, temp21); + +#define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB, \ + inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6, \ + out7, out8, out9, outA, outB, outC, outD, outE, outF) \ + /* stage 1 */ \ + /* out0 = in0; */ \ + out1 = in8; \ + out2 = in4; \ + out3 = inC; \ + out4 = in2; \ + out5 = inA; \ + out6 = in6; \ + out7 = inE; \ + out8 = in1; \ + out9 = in9; \ + outA = in5; \ + outB = inD; \ + outC = in3; \ + outD = inB; \ + outE = in7; \ + outF = inF; \ + \ + /* stage 2 */ \ + /* in0 = out0; */ \ + in1 = out1; \ + in2 = out2; \ + in3 = out3; \ + in4 = out4; \ + in5 = out5; \ + in6 = out6; \ + in7 = out7; \ + \ + STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v); \ + STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v); \ + STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v); \ + STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v); \ + \ + /* stage 3 */ \ + out0 = in0; \ + out1 = in1; \ + out2 = in2; \ + out3 = in3; \ + \ + STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v); \ + STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v); \ + \ + out8 = vec_add(in8, in9); \ + out9 = vec_sub(in8, in9); \ + outA = vec_sub(inB, inA); \ + outB = vec_add(inA, inB); \ + outC = vec_add(inC, inD); \ + outD = vec_sub(inC, inD); \ + outE = vec_sub(inF, inE); \ + outF = vec_add(inE, inF); \ + \ + /* stage 4 */ \ + STEP16_1(out0, out1, in1, in0, cospi16_v); \ + STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v); \ + in4 = vec_add(out4, out5); \ + in5 = vec_sub(out4, out5); \ + in6 = vec_sub(out7, out6); \ + in7 = vec_add(out6, out7); \ + \ + in8 = out8; \ + inF = outF; \ + tmp16_0 = vec_mergeh(out9, outE); \ + tmp16_1 = vec_mergel(out9, outE); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + in9 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inE = vec_packs(temp10, temp11); \ + \ + tmp16_0 
= vec_mergeh(outA, outD); \ + tmp16_1 = vec_mergel(outA, outD); \ + temp10 = \ + vec_sub(vec_mule(tmp16_0, cospi24_mv), vec_mulo(tmp16_0, cospi8_v)); \ + temp11 = \ + vec_sub(vec_mule(tmp16_1, cospi24_mv), vec_mulo(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inA = vec_packs(temp10, temp11); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inD = vec_packs(temp10, temp11); \ + \ + inB = outB; \ + inC = outC; \ + \ + /* stage 5 */ \ + out0 = vec_add(in0, in3); \ + out1 = vec_add(in1, in2); \ + out2 = vec_sub(in1, in2); \ + out3 = vec_sub(in0, in3); \ + out4 = in4; \ + STEP16_1(in6, in5, out5, out6, cospi16_v); \ + out7 = in7; \ + \ + out8 = vec_add(in8, inB); \ + out9 = vec_add(in9, inA); \ + outA = vec_sub(in9, inA); \ + outB = vec_sub(in8, inB); \ + outC = vec_sub(inF, inC); \ + outD = vec_sub(inE, inD); \ + outE = vec_add(inD, inE); \ + outF = vec_add(inC, inF); \ + \ + /* stage 6 */ \ + in0 = vec_add(out0, out7); \ + in1 = vec_add(out1, out6); \ + in2 = vec_add(out2, out5); \ + in3 = vec_add(out3, out4); \ + in4 = vec_sub(out3, out4); \ + in5 = vec_sub(out2, out5); \ + in6 = vec_sub(out1, out6); \ + in7 = vec_sub(out0, out7); \ + in8 = out8; \ + in9 = out9; \ + STEP16_1(outD, outA, inA, inD, cospi16_v); \ + STEP16_1(outC, outB, inB, inC, cospi16_v); \ + inE = outE; \ + inF = outF; \ + \ + /* stage 7 */ \ + out0 = vec_add(in0, inF); \ + out1 = vec_add(in1, inE); \ + out2 = vec_add(in2, inD); \ + out3 = vec_add(in3, inC); \ + out4 = vec_add(in4, inB); \ + out5 = vec_add(in5, inA); \ + out6 = vec_add(in6, in9); \ + out7 = vec_add(in7, in8); \ + out8 = vec_sub(in7, in8); \ + out9 = vec_sub(in6, in9); \ + outA = vec_sub(in5, inA); \ + outB = vec_sub(in4, inB); \ + outC = vec_sub(in3, inC); \ + outD = vec_sub(in2, inD); \ + outE = vec_sub(in1, inE); \ + outF = vec_sub(in0, inF); + +#define PIXEL_ADD_STORE16(in0, in1, dst, offset) \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in0, d_uh, add, shift6); \ + PIXEL_ADD(in1, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest); + +void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t src00, src01, src02, src03, src04, src05, src06, src07, src10, + src11, src12, src13, src14, src15, src16, src17; + int16x8_t src20, src21, src22, src23, src24, src25, src26, src27, src30, + src31, src32, src33, src34, src35, src36, src37; + int16x8_t tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, + tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp16_0, tmp16_1; + int16x8_t tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, + tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37; + uint8x16_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8, + dest9, destA, destB, destC, destD, destE, destF; + int16x8_t d_uh, d_ul; + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + uint16x8_t shift6 = vec_splat_u16(6); + uint8x16_t zerov = vec_splat_u8(0); + ROUND_SHIFT_INIT; + + // transform rows + // load and transform the upper half of 16x16 matrix + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src00, src10, src01, + src11, src02, src12, src03, src13, src04, src14, src05, src15, + src06, src16, src07, src17); + 
TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00, + tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07); + TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10, + tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17); + IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, tmp11, + tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, src00, src01, src02, src03, + src04, src05, src06, src07, src10, src11, src12, src13, src14, src15, + src16, src17); + TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00, + tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07); + TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10, + tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17); + + // load and transform the lower half of 16x16 matrix + LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), + 8 * sizeof(*input), src20, src30, src21, src31, src22, src32, + src23, src33, src24, src34, src25, src35, src26, src36, src27, + src37); + TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20, + tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27); + TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30, + tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37); + IDCT16(tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, tmp31, + tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src20, src21, src22, src23, + src24, src25, src26, src27, src30, src31, src32, src33, src34, src35, + src36, src37); + TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20, + tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27); + TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30, + tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37); + + // transform columns + // left half first + IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp20, tmp21, + tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, src00, src01, src02, src03, + src04, src05, src06, src07, src20, src21, src22, src23, src24, src25, + src26, src27); + // right half + IDCT16(tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp30, tmp31, + tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src10, src11, src12, src13, + src14, src15, src16, src17, src30, src31, src32, src33, src34, src35, + src36, src37); + + // load dest + LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, dest0, dest1, dest2, dest3, dest4, + dest5, dest6, dest7, dest8, dest9, destA, destB, destC, destD, + destE, destF); + + PIXEL_ADD_STORE16(src00, src10, dest0, 0); + PIXEL_ADD_STORE16(src01, src11, dest1, stride); + PIXEL_ADD_STORE16(src02, src12, dest2, 2 * stride); + PIXEL_ADD_STORE16(src03, src13, dest3, 3 * stride); + PIXEL_ADD_STORE16(src04, src14, dest4, 4 * stride); + PIXEL_ADD_STORE16(src05, src15, dest5, 5 * stride); + PIXEL_ADD_STORE16(src06, src16, dest6, 6 * stride); + PIXEL_ADD_STORE16(src07, src17, dest7, 7 * stride); + + PIXEL_ADD_STORE16(src20, src30, dest8, 8 * stride); + PIXEL_ADD_STORE16(src21, src31, dest9, 9 * stride); + PIXEL_ADD_STORE16(src22, src32, destA, 10 * stride); + PIXEL_ADD_STORE16(src23, src33, destB, 11 * stride); + PIXEL_ADD_STORE16(src24, src34, destC, 12 * stride); + PIXEL_ADD_STORE16(src25, src35, destD, 13 * stride); + PIXEL_ADD_STORE16(src26, src36, destE, 14 * stride); + PIXEL_ADD_STORE16(src27, src37, destF, 15 * stride); +} + +#define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \ + in21, in22, in23, in30, in31, in32, in33, in40, in41, in42, \ + in43, in50, in51, in52, in53, in60, in61, 
in62, in63, in70, \ + in71, in72, in73, offset) \ + /* load the first row from the 8x32 block*/ \ + in00 = load(offset, input); \ + in01 = load(offset + 16, input); \ + in02 = load(offset + 2 * 16, input); \ + in03 = load(offset + 3 * 16, input); \ + \ + in10 = load(offset + 4 * 16, input); \ + in11 = load(offset + 5 * 16, input); \ + in12 = load(offset + 6 * 16, input); \ + in13 = load(offset + 7 * 16, input); \ + \ + in20 = load(offset + 8 * 16, input); \ + in21 = load(offset + 9 * 16, input); \ + in22 = load(offset + 10 * 16, input); \ + in23 = load(offset + 11 * 16, input); \ + \ + in30 = load(offset + 12 * 16, input); \ + in31 = load(offset + 13 * 16, input); \ + in32 = load(offset + 14 * 16, input); \ + in33 = load(offset + 15 * 16, input); \ + \ + in40 = load(offset + 16 * 16, input); \ + in41 = load(offset + 17 * 16, input); \ + in42 = load(offset + 18 * 16, input); \ + in43 = load(offset + 19 * 16, input); \ + \ + in50 = load(offset + 20 * 16, input); \ + in51 = load(offset + 21 * 16, input); \ + in52 = load(offset + 22 * 16, input); \ + in53 = load(offset + 23 * 16, input); \ + \ + in60 = load(offset + 24 * 16, input); \ + in61 = load(offset + 25 * 16, input); \ + in62 = load(offset + 26 * 16, input); \ + in63 = load(offset + 27 * 16, input); \ + \ + /* load the last row from the 8x32 block*/ \ + in70 = load(offset + 28 * 16, input); \ + in71 = load(offset + 29 * 16, input); \ + in72 = load(offset + 30 * 16, input); \ + in73 = load(offset + 31 * 16, input); + +/* for the: temp1 = -step[x] * cospi_q + step[y] * cospi_z + * temp2 = step[x] * cospi_z + step[y] * cospi_q */ +#define STEP32(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi1), vec_mule(tmp16_0, cospi0)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi1), vec_mule(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +/* for the: temp1 = -step[x] * cospi_q - step[y] * cospi_z + * temp2 = -step[x] * cospi_z + step[y] * cospi_q */ +#define STEP32_1(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1, cospi1m) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi1m), vec_mule(tmp16_0, cospi0)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi1m), vec_mule(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi0), vec_mule(tmp16_0, cospi1)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi0), vec_mule(tmp16_1, cospi1)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define IDCT32(in0, in1, in2, in3, out) \ + \ + /* stage 1 */ \ + /* out[0][0] = in[0][0]; */ \ + out[0][1] = in2[0]; \ + out[0][2] = in1[0]; \ + out[0][3] = in3[0]; \ + out[0][4] = in0[4]; \ + out[0][5] = in2[4]; \ + out[0][6] = in1[4]; \ + out[0][7] = in3[4]; \ + out[1][0] = in0[2]; \ + out[1][1] = in2[2]; \ + out[1][2] = in1[2]; \ + out[1][3] = in3[2]; \ + out[1][4] = in0[6]; \ + out[1][5] = in2[6]; \ + out[1][6] = in1[6]; \ + out[1][7] = in3[6]; \ + \ + STEP8_0(in0[1], 
in3[7], out[2][0], out[3][7], cospi31_v, cospi1_v); \ + STEP8_0(in2[1], in1[7], out[2][1], out[3][6], cospi15_v, cospi17_v); \ + STEP8_0(in1[1], in2[7], out[2][2], out[3][5], cospi23_v, cospi9_v); \ + STEP8_0(in3[1], in0[7], out[2][3], out[3][4], cospi7_v, cospi25_v); \ + STEP8_0(in0[5], in3[3], out[2][4], out[3][3], cospi27_v, cospi5_v); \ + STEP8_0(in2[5], in1[3], out[2][5], out[3][2], cospi11_v, cospi21_v); \ + STEP8_0(in1[5], in2[3], out[2][6], out[3][1], cospi19_v, cospi13_v); \ + STEP8_0(in3[5], in0[3], out[2][7], out[3][0], cospi3_v, cospi29_v); \ + \ + /* stage 2 */ \ + /* in0[0] = out[0][0]; */ \ + in0[1] = out[0][1]; \ + in0[2] = out[0][2]; \ + in0[3] = out[0][3]; \ + in0[4] = out[0][4]; \ + in0[5] = out[0][5]; \ + in0[6] = out[0][6]; \ + in0[7] = out[0][7]; \ + \ + STEP8_0(out[1][0], out[1][7], in1[0], in1[7], cospi30_v, cospi2_v); \ + STEP8_0(out[1][1], out[1][6], in1[1], in1[6], cospi14_v, cospi18_v); \ + STEP8_0(out[1][2], out[1][5], in1[2], in1[5], cospi22_v, cospi10_v); \ + STEP8_0(out[1][3], out[1][4], in1[3], in1[4], cospi6_v, cospi26_v); \ + \ + in2[0] = vec_add(out[2][0], out[2][1]); \ + in2[1] = vec_sub(out[2][0], out[2][1]); \ + in2[2] = vec_sub(out[2][3], out[2][2]); \ + in2[3] = vec_add(out[2][3], out[2][2]); \ + in2[4] = vec_add(out[2][4], out[2][5]); \ + in2[5] = vec_sub(out[2][4], out[2][5]); \ + in2[6] = vec_sub(out[2][7], out[2][6]); \ + in2[7] = vec_add(out[2][7], out[2][6]); \ + in3[0] = vec_add(out[3][0], out[3][1]); \ + in3[1] = vec_sub(out[3][0], out[3][1]); \ + in3[2] = vec_sub(out[3][3], out[3][2]); \ + in3[3] = vec_add(out[3][3], out[3][2]); \ + in3[4] = vec_add(out[3][4], out[3][5]); \ + in3[5] = vec_sub(out[3][4], out[3][5]); \ + in3[6] = vec_sub(out[3][7], out[3][6]); \ + in3[7] = vec_add(out[3][6], out[3][7]); \ + \ + /* stage 3 */ \ + out[0][0] = in0[0]; \ + out[0][1] = in0[1]; \ + out[0][2] = in0[2]; \ + out[0][3] = in0[3]; \ + \ + STEP8_0(in0[4], in0[7], out[0][4], out[0][7], cospi28_v, cospi4_v); \ + STEP8_0(in0[5], in0[6], out[0][5], out[0][6], cospi12_v, cospi20_v); \ + \ + out[1][0] = vec_add(in1[0], in1[1]); \ + out[1][1] = vec_sub(in1[0], in1[1]); \ + out[1][2] = vec_sub(in1[3], in1[2]); \ + out[1][3] = vec_add(in1[2], in1[3]); \ + out[1][4] = vec_add(in1[4], in1[5]); \ + out[1][5] = vec_sub(in1[4], in1[5]); \ + out[1][6] = vec_sub(in1[7], in1[6]); \ + out[1][7] = vec_add(in1[6], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[3][7] = in3[7]; \ + STEP32(in2[1], in3[6], out[2][1], out[3][6], cospi4_v, cospi28_v); \ + STEP32_1(in2[2], in3[5], out[2][2], out[3][5], cospi28_v, cospi4_v, \ + cospi4m_v); \ + out[2][3] = in2[3]; \ + out[2][4] = in2[4]; \ + STEP32(in2[5], in3[2], out[2][5], out[3][2], cospi20_v, cospi12_v); \ + STEP32_1(in2[6], in3[1], out[2][6], out[3][1], cospi12_v, cospi20_v, \ + cospi20m_v); \ + out[2][7] = in2[7]; \ + out[3][0] = in3[0]; \ + out[3][3] = in3[3]; \ + out[3][4] = in3[4]; \ + \ + /* stage 4 */ \ + STEP16_1(out[0][0], out[0][1], in0[1], in0[0], cospi16_v); \ + STEP8_0(out[0][2], out[0][3], in0[2], in0[3], cospi24_v, cospi8_v); \ + in0[4] = vec_add(out[0][4], out[0][5]); \ + in0[5] = vec_sub(out[0][4], out[0][5]); \ + in0[6] = vec_sub(out[0][7], out[0][6]); \ + in0[7] = vec_add(out[0][7], out[0][6]); \ + \ + in1[0] = out[1][0]; \ + in1[7] = out[1][7]; \ + STEP32(out[1][1], out[1][6], in1[1], in1[6], cospi8_v, cospi24_v); \ + STEP32_1(out[1][2], out[1][5], in1[2], in1[5], cospi24_v, cospi8_v, \ + cospi8m_v); \ + in1[3] = out[1][3]; \ + in1[4] = out[1][4]; \ + \ + in2[0] = vec_add(out[2][0], out[2][3]); \ + in2[1] = 
vec_add(out[2][1], out[2][2]); \ + in2[2] = vec_sub(out[2][1], out[2][2]); \ + in2[3] = vec_sub(out[2][0], out[2][3]); \ + in2[4] = vec_sub(out[2][7], out[2][4]); \ + in2[5] = vec_sub(out[2][6], out[2][5]); \ + in2[6] = vec_add(out[2][5], out[2][6]); \ + in2[7] = vec_add(out[2][4], out[2][7]); \ + \ + in3[0] = vec_add(out[3][0], out[3][3]); \ + in3[1] = vec_add(out[3][1], out[3][2]); \ + in3[2] = vec_sub(out[3][1], out[3][2]); \ + in3[3] = vec_sub(out[3][0], out[3][3]); \ + in3[4] = vec_sub(out[3][7], out[3][4]); \ + in3[5] = vec_sub(out[3][6], out[3][5]); \ + in3[6] = vec_add(out[3][5], out[3][6]); \ + in3[7] = vec_add(out[3][4], out[3][7]); \ + \ + /* stage 5 */ \ + out[0][0] = vec_add(in0[0], in0[3]); \ + out[0][1] = vec_add(in0[1], in0[2]); \ + out[0][2] = vec_sub(in0[1], in0[2]); \ + out[0][3] = vec_sub(in0[0], in0[3]); \ + out[0][4] = in0[4]; \ + STEP16_1(in0[6], in0[5], out[0][5], out[0][6], cospi16_v); \ + out[0][7] = in0[7]; \ + \ + out[1][0] = vec_add(in1[0], in1[3]); \ + out[1][1] = vec_add(in1[1], in1[2]); \ + out[1][2] = vec_sub(in1[1], in1[2]); \ + out[1][3] = vec_sub(in1[0], in1[3]); \ + out[1][4] = vec_sub(in1[7], in1[4]); \ + out[1][5] = vec_sub(in1[6], in1[5]); \ + out[1][6] = vec_add(in1[5], in1[6]); \ + out[1][7] = vec_add(in1[4], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[2][1] = in2[1]; \ + STEP32(in2[2], in3[5], out[2][2], out[3][5], cospi8_v, cospi24_v); \ + STEP32(in2[3], in3[4], out[2][3], out[3][4], cospi8_v, cospi24_v); \ + STEP32_1(in2[4], in3[3], out[2][4], out[3][3], cospi24_v, cospi8_v, \ + cospi8m_v); \ + STEP32_1(in2[5], in3[2], out[2][5], out[3][2], cospi24_v, cospi8_v, \ + cospi8m_v); \ + out[2][6] = in2[6]; \ + out[2][7] = in2[7]; \ + out[3][0] = in3[0]; \ + out[3][1] = in3[1]; \ + out[3][6] = in3[6]; \ + out[3][7] = in3[7]; \ + \ + /* stage 6 */ \ + in0[0] = vec_add(out[0][0], out[0][7]); \ + in0[1] = vec_add(out[0][1], out[0][6]); \ + in0[2] = vec_add(out[0][2], out[0][5]); \ + in0[3] = vec_add(out[0][3], out[0][4]); \ + in0[4] = vec_sub(out[0][3], out[0][4]); \ + in0[5] = vec_sub(out[0][2], out[0][5]); \ + in0[6] = vec_sub(out[0][1], out[0][6]); \ + in0[7] = vec_sub(out[0][0], out[0][7]); \ + in1[0] = out[1][0]; \ + in1[1] = out[1][1]; \ + STEP16_1(out[1][5], out[1][2], in1[2], in1[5], cospi16_v); \ + STEP16_1(out[1][4], out[1][3], in1[3], in1[4], cospi16_v); \ + in1[6] = out[1][6]; \ + in1[7] = out[1][7]; \ + \ + in2[0] = vec_add(out[2][0], out[2][7]); \ + in2[1] = vec_add(out[2][1], out[2][6]); \ + in2[2] = vec_add(out[2][2], out[2][5]); \ + in2[3] = vec_add(out[2][3], out[2][4]); \ + in2[4] = vec_sub(out[2][3], out[2][4]); \ + in2[5] = vec_sub(out[2][2], out[2][5]); \ + in2[6] = vec_sub(out[2][1], out[2][6]); \ + in2[7] = vec_sub(out[2][0], out[2][7]); \ + \ + in3[0] = vec_sub(out[3][7], out[3][0]); \ + in3[1] = vec_sub(out[3][6], out[3][1]); \ + in3[2] = vec_sub(out[3][5], out[3][2]); \ + in3[3] = vec_sub(out[3][4], out[3][3]); \ + in3[4] = vec_add(out[3][4], out[3][3]); \ + in3[5] = vec_add(out[3][5], out[3][2]); \ + in3[6] = vec_add(out[3][6], out[3][1]); \ + in3[7] = vec_add(out[3][7], out[3][0]); \ + \ + /* stage 7 */ \ + out[0][0] = vec_add(in0[0], in1[7]); \ + out[0][1] = vec_add(in0[1], in1[6]); \ + out[0][2] = vec_add(in0[2], in1[5]); \ + out[0][3] = vec_add(in0[3], in1[4]); \ + out[0][4] = vec_add(in0[4], in1[3]); \ + out[0][5] = vec_add(in0[5], in1[2]); \ + out[0][6] = vec_add(in0[6], in1[1]); \ + out[0][7] = vec_add(in0[7], in1[0]); \ + out[1][0] = vec_sub(in0[7], in1[0]); \ + out[1][1] = vec_sub(in0[6], in1[1]); \ + out[1][2] = 
vec_sub(in0[5], in1[2]); \ + out[1][3] = vec_sub(in0[4], in1[3]); \ + out[1][4] = vec_sub(in0[3], in1[4]); \ + out[1][5] = vec_sub(in0[2], in1[5]); \ + out[1][6] = vec_sub(in0[1], in1[6]); \ + out[1][7] = vec_sub(in0[0], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[2][1] = in2[1]; \ + out[2][2] = in2[2]; \ + out[2][3] = in2[3]; \ + STEP16_1(in3[3], in2[4], out[2][4], out[3][3], cospi16_v); \ + STEP16_1(in3[2], in2[5], out[2][5], out[3][2], cospi16_v); \ + STEP16_1(in3[1], in2[6], out[2][6], out[3][1], cospi16_v); \ + STEP16_1(in3[0], in2[7], out[2][7], out[3][0], cospi16_v); \ + out[3][4] = in3[4]; \ + out[3][5] = in3[5]; \ + out[3][6] = in3[6]; \ + out[3][7] = in3[7]; \ + \ + /* final */ \ + in0[0] = vec_add(out[0][0], out[3][7]); \ + in0[1] = vec_add(out[0][1], out[3][6]); \ + in0[2] = vec_add(out[0][2], out[3][5]); \ + in0[3] = vec_add(out[0][3], out[3][4]); \ + in0[4] = vec_add(out[0][4], out[3][3]); \ + in0[5] = vec_add(out[0][5], out[3][2]); \ + in0[6] = vec_add(out[0][6], out[3][1]); \ + in0[7] = vec_add(out[0][7], out[3][0]); \ + in1[0] = vec_add(out[1][0], out[2][7]); \ + in1[1] = vec_add(out[1][1], out[2][6]); \ + in1[2] = vec_add(out[1][2], out[2][5]); \ + in1[3] = vec_add(out[1][3], out[2][4]); \ + in1[4] = vec_add(out[1][4], out[2][3]); \ + in1[5] = vec_add(out[1][5], out[2][2]); \ + in1[6] = vec_add(out[1][6], out[2][1]); \ + in1[7] = vec_add(out[1][7], out[2][0]); \ + in2[0] = vec_sub(out[1][7], out[2][0]); \ + in2[1] = vec_sub(out[1][6], out[2][1]); \ + in2[2] = vec_sub(out[1][5], out[2][2]); \ + in2[3] = vec_sub(out[1][4], out[2][3]); \ + in2[4] = vec_sub(out[1][3], out[2][4]); \ + in2[5] = vec_sub(out[1][2], out[2][5]); \ + in2[6] = vec_sub(out[1][1], out[2][6]); \ + in2[7] = vec_sub(out[1][0], out[2][7]); \ + in3[0] = vec_sub(out[0][7], out[3][0]); \ + in3[1] = vec_sub(out[0][6], out[3][1]); \ + in3[2] = vec_sub(out[0][5], out[3][2]); \ + in3[3] = vec_sub(out[0][4], out[3][3]); \ + in3[4] = vec_sub(out[0][3], out[3][4]); \ + in3[5] = vec_sub(out[0][2], out[3][5]); \ + in3[6] = vec_sub(out[0][1], out[3][6]); \ + in3[7] = vec_sub(out[0][0], out[3][7]); + +// NOT A FULL TRANSPOSE! 
Transposes just each 8x8 block in each row, +// does not transpose rows +#define TRANSPOSE_8x32(in, out) \ + /* transpose 4 of 8x8 blocks */ \ + TRANSPOSE8x8(in[0][0], in[0][1], in[0][2], in[0][3], in[0][4], in[0][5], \ + in[0][6], in[0][7], out[0][0], out[0][1], out[0][2], out[0][3], \ + out[0][4], out[0][5], out[0][6], out[0][7]); \ + TRANSPOSE8x8(in[1][0], in[1][1], in[1][2], in[1][3], in[1][4], in[1][5], \ + in[1][6], in[1][7], out[1][0], out[1][1], out[1][2], out[1][3], \ + out[1][4], out[1][5], out[1][6], out[1][7]); \ + TRANSPOSE8x8(in[2][0], in[2][1], in[2][2], in[2][3], in[2][4], in[2][5], \ + in[2][6], in[2][7], out[2][0], out[2][1], out[2][2], out[2][3], \ + out[2][4], out[2][5], out[2][6], out[2][7]); \ + TRANSPOSE8x8(in[3][0], in[3][1], in[3][2], in[3][3], in[3][4], in[3][5], \ + in[3][6], in[3][7], out[3][0], out[3][1], out[3][2], out[3][3], \ + out[3][4], out[3][5], out[3][6], out[3][7]); + +#define PIXEL_ADD_STORE32(in0, in1, in2, in3, step) \ + dst = vec_vsx_ld((step)*stride, dest); \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in0, d_uh, add, shift6); \ + PIXEL_ADD(in1, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride, dest); \ + dst = vec_vsx_ld((step)*stride + 16, dest); \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in2, d_uh, add, shift6); \ + PIXEL_ADD(in3, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride + 16, dest); + +#define ADD_STORE_BLOCK(in, offset) \ + PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], offset + 0); \ + PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], offset + 1); \ + PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], offset + 2); \ + PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], offset + 3); \ + PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], offset + 4); \ + PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], offset + 5); \ + PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], offset + 6); \ + PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], offset + 7); + +void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src0[4][8], src1[4][8], src2[4][8], src3[4][8], tmp[4][8]; + int16x8_t tmp16_0, tmp16_1; + int32x4_t temp10, temp11, temp20, temp21, temp30; + uint8x16_t dst; + int16x8_t d_uh, d_ul; + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + uint16x8_t shift6 = vec_splat_u16(6); + uint8x16_t zerov = vec_splat_u8(0); + + ROUND_SHIFT_INIT; + + LOAD_8x32(load_tran_low, src0[0][0], src0[1][0], src0[2][0], src0[3][0], + src0[0][1], src0[1][1], src0[2][1], src0[3][1], src0[0][2], + src0[1][2], src0[2][2], src0[3][2], src0[0][3], src0[1][3], + src0[2][3], src0[3][3], src0[0][4], src0[1][4], src0[2][4], + src0[3][4], src0[0][5], src0[1][5], src0[2][5], src0[3][5], + src0[0][6], src0[1][6], src0[2][6], src0[3][6], src0[0][7], + src0[1][7], src0[2][7], src0[3][7], 0); + // Rows + // transpose the first row of 8x8 blocks + TRANSPOSE_8x32(src0, tmp); + // transform the 32x8 column + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src0); + TRANSPOSE_8x32(tmp, src0); + + LOAD_8x32(load_tran_low, src1[0][0], src1[1][0], src1[2][0], src1[3][0], + src1[0][1], src1[1][1], src1[2][1], src1[3][1], src1[0][2], + src1[1][2], src1[2][2], src1[3][2], src1[0][3], src1[1][3], + src1[2][3], src1[3][3], src1[0][4], src1[1][4], src1[2][4], + src1[3][4], src1[0][5], src1[1][5], src1[2][5], 
src1[3][5], + src1[0][6], src1[1][6], src1[2][6], src1[3][6], src1[0][7], + src1[1][7], src1[2][7], src1[3][7], 512); + TRANSPOSE_8x32(src1, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src1); + TRANSPOSE_8x32(tmp, src1); + + LOAD_8x32(load_tran_low, src2[0][0], src2[1][0], src2[2][0], src2[3][0], + src2[0][1], src2[1][1], src2[2][1], src2[3][1], src2[0][2], + src2[1][2], src2[2][2], src2[3][2], src2[0][3], src2[1][3], + src2[2][3], src2[3][3], src2[0][4], src2[1][4], src2[2][4], + src2[3][4], src2[0][5], src2[1][5], src2[2][5], src2[3][5], + src2[0][6], src2[1][6], src2[2][6], src2[3][6], src2[0][7], + src2[1][7], src2[2][7], src2[3][7], 1024); + TRANSPOSE_8x32(src2, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src2); + TRANSPOSE_8x32(tmp, src2); + + LOAD_8x32(load_tran_low, src3[0][0], src3[1][0], src3[2][0], src3[3][0], + src3[0][1], src3[1][1], src3[2][1], src3[3][1], src3[0][2], + src3[1][2], src3[2][2], src3[3][2], src3[0][3], src3[1][3], + src3[2][3], src3[3][3], src3[0][4], src3[1][4], src3[2][4], + src3[3][4], src3[0][5], src3[1][5], src3[2][5], src3[3][5], + src3[0][6], src3[1][6], src3[2][6], src3[3][6], src3[0][7], + src3[1][7], src3[2][7], src3[3][7], 1536); + TRANSPOSE_8x32(src3, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src3); + TRANSPOSE_8x32(tmp, src3); + + // Columns + IDCT32(src0[0], src1[0], src2[0], src3[0], tmp); + IDCT32(src0[1], src1[1], src2[1], src3[1], tmp); + IDCT32(src0[2], src1[2], src2[2], src3[2], tmp); + IDCT32(src0[3], src1[3], src2[3], src3[3], tmp); + + ADD_STORE_BLOCK(src0, 0); + ADD_STORE_BLOCK(src1, 8); + ADD_STORE_BLOCK(src2, 16); + ADD_STORE_BLOCK(src3, 24); +} diff --git a/libvpx/vpx_dsp/ppc/sad_vsx.c b/libvpx/vpx_dsp/ppc/sad_vsx.c index 3edb40c31..bb49addae 100644 --- a/libvpx/vpx_dsp/ppc/sad_vsx.c +++ b/libvpx/vpx_dsp/ppc/sad_vsx.c @@ -10,9 +10,12 @@ #include <stdlib.h> +#include "./vpx_dsp_rtcd.h" + #include "vpx_dsp/ppc/types_vsx.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" #define PROCESS16(offset) \ v_a = vec_vsx_ld(offset, a); \ @@ -100,3 +103,152 @@ SAD32(32); SAD32(64); SAD64(32); SAD64(64); + +#define SAD16AVG(height) \ + unsigned int vpx_sad16x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * height]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 16, height, ref, \ + ref_stride); \ + \ + return vpx_sad16x##height##_vsx(src, src_stride, comp_pred, 16); \ + } + +#define SAD32AVG(height) \ + unsigned int vpx_sad32x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * height]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 32, height, ref, \ + ref_stride); \ + \ + return vpx_sad32x##height##_vsx(src, src_stride, comp_pred, 32); \ + } + +#define SAD64AVG(height) \ + unsigned int vpx_sad64x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * height]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 64, height, ref, \ + ref_stride); \ + return vpx_sad64x##height##_vsx(src, src_stride, comp_pred, 64); \ + } + +SAD16AVG(8); +SAD16AVG(16); +SAD16AVG(32); +SAD32AVG(16); +SAD32AVG(32); +SAD32AVG(64); +SAD64AVG(32); +SAD64AVG(64); + +#define PROCESS16_4D(offset, ref, v_h, v_l) \ + v_b = vec_vsx_ld(offset, ref); \ + v_bh = unpack_to_s16_h(v_b); \ 
+ v_bl = unpack_to_s16_l(v_b); \ + v_subh = vec_sub(v_h, v_bh); \ + v_subl = vec_sub(v_l, v_bl); \ + v_absh = vec_abs(v_subh); \ + v_absl = vec_abs(v_subl); \ + v_sad = vec_sum4s(v_absh, v_sad); \ + v_sad = vec_sum4s(v_absl, v_sad); + +#define UNPACK_SRC(offset, srcv_h, srcv_l) \ + v_a = vec_vsx_ld(offset, src); \ + srcv_h = unpack_to_s16_h(v_a); \ + srcv_l = unpack_to_s16_l(v_a); + +#define SAD16_4D(height) \ + void vpx_sad16x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah, v_al); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah, v_al); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +#define SAD32_4D(height) \ + void vpx_sad32x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \ + int16x8_t v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah1, v_al1); \ + UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \ + PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +#define SAD64_4D(height) \ + void vpx_sad64x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \ + int16x8_t v_ah3, v_al3, v_ah4, v_al4; \ + int16x8_t v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah1, v_al1); \ + UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \ + UNPACK_SRC(y *src_stride + 32, v_ah3, v_al3); \ + UNPACK_SRC(y *src_stride + 48, v_ah4, v_al4); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \ + PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \ + PROCESS16_4D(y *ref_stride + 32, ref_array[i], v_ah3, v_al3); \ + PROCESS16_4D(y *ref_stride + 48, ref_array[i], v_ah4, v_al4); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +SAD16_4D(8); +SAD16_4D(16); +SAD16_4D(32); +SAD32_4D(16); +SAD32_4D(32); +SAD32_4D(64); +SAD64_4D(32); +SAD64_4D(64); diff --git a/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c b/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c index 55dcdc2ba..5c3ba4576 100644 --- a/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c +++ b/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c @@ -53,13 +53,13 @@ static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve_copy_vsx(const uint8_t *src, 
ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, - int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 16: { @@ -132,14 +132,8 @@ static inline void avg_w64(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, - int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; - + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { switch (w) { case 16: { avg_w16(src, src_stride, dst, dst_stride, h); @@ -154,8 +148,8 @@ void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, break; } default: { - vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter_x, - filter_x_stride, filter_y, filter_y_stride, w, h); + vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } @@ -299,9 +293,9 @@ static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, static inline void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h) { + const InterpKernel *const filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. 
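
A quick sanity check of the temp-buffer bound described in the comment above — a standalone sketch, not part of the patch, assuming the SUBPEL_BITS == 4 and SUBPEL_TAPS == 8 definitions from vpx_filter.h:

```c
/* Standalone sketch (not part of the patch): verifies the worst-case
 * intermediate height quoted in the convolve() comment above. */
#include <assert.h>

#define SUBPEL_BITS 4 /* assumed, per vpx_filter.h */
#define SUBPEL_TAPS 8 /* assumed, per vpx_filter.h */

static int intermediate_height(int h, int y0_q4, int y_step_q4) {
  /* Rows consumed by the vertical pass, plus the 8-tap filter tails. */
  return (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
}

int main(void) {
  /* h = 64, y_step_q4 = 32 (x1/2 scaling), worst sub-pixel phase y0_q4 = 15:
   * ((63 * 32 + 15) >> 4) + 8 = 126 + 8 = 134, which fits in 64 * 135. */
  assert(intermediate_height(64, 15, 32) <= 135);
  return 0;
}
```
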
@@ -324,95 +318,77 @@ static inline void convolve(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 <= 32); convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, - x_filters, x0_q4, x_step_q4, w, intermediate_height); - convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h); + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); } void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - w, h); + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, + h); } void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, w, h); + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); } void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, - w, h); + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); } void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, - y_step_q4, w, h); + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, + w, h); } void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = 
get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); + convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); } void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { // Fixed size intermediate buffer places limits on parameters. DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_convolve8_vsx(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y, + vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); - vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); + vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); } diff --git a/libvpx/vpx_dsp/quantize.c b/libvpx/vpx_dsp/quantize.c index 3c7f9832f..e37ca92ad 100644 --- a/libvpx/vpx_dsp/quantize.c +++ b/libvpx/vpx_dsp/quantize.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/quantize.h" #include "vpx_mem/vpx_mem.h" @@ -123,40 +125,40 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff >= zbins[rc != 0]) { - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> - 16; // quantization - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - - if (tmp) eob = i; - } + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
+ for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= zbins[rc != 0]) { + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + 16; // quantization + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + + if (tmp) eob = i; } } *eob_ptr = eob + 1; @@ -174,39 +176,38 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff >= zbins[rc != 0]) { - const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) eob = i; - } + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= zbins[rc != 0]) { + const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_qcoeff) eob = i; } } *eob_ptr = eob + 1; @@ -228,41 +229,40 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int idx_arr[1024]; int i, eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. 
- if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - int tmp; - int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); - tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * - quant_shift_ptr[rc != 0]) >> - 15; + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; + } - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * + quant_shift_ptr[rc != 0]) >> + 15; - if (tmp) eob = idx_arr[i]; - } + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + + if (tmp) eob = idx_arr[i]; } *eob_ptr = eob + 1; } @@ -282,38 +282,35 @@ void vpx_highbd_quantize_b_32x32_c( int idx_arr[1024]; int i, eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = idx_arr[i]; - } + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. 
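/* Editorial note, not part of the patch: the 32x32 paths differ from the
 * smaller block sizes in three compensating ways -- the rounding term is
 * halved via ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), the second-stage
 * shift is 15 rather than 16 (doubling the quantized magnitude), and
 * dequantization divides by 2 to match. Assuming the usual vpx_dsp macro,
 *   #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
 * the halved rounding term is simply (round + 1) >> 1. */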
+ for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = idx_arr[i]; } *eob_ptr = eob + 1; } diff --git a/libvpx/vpx_dsp/sad.c b/libvpx/vpx_dsp/sad.c index 6ceb37e43..18b6dc6e0 100644 --- a/libvpx/vpx_dsp/sad.c +++ b/libvpx/vpx_dsp/sad.c @@ -70,8 +70,6 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, /* clang-format off */ // 64x64 sadMxN(64, 64) -sadMxNxK(64, 64, 3) -sadMxNxK(64, 64, 8) sadMxNx4D(64, 64) // 64x32 @@ -84,8 +82,6 @@ sadMxNx4D(32, 64) // 32x32 sadMxN(32, 32) -sadMxNxK(32, 32, 3) -sadMxNxK(32, 32, 8) sadMxNx4D(32, 32) // 32x16 @@ -122,12 +118,10 @@ sadMxNx4D(8, 8) // 8x4 sadMxN(8, 4) -sadMxNxK(8, 4, 8) sadMxNx4D(8, 4) // 4x8 sadMxN(4, 8) -sadMxNxK(4, 8, 8) sadMxNx4D(4, 8) // 4x4 @@ -183,17 +177,6 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ } -#define highbd_sadMxNxK(m, n, k) \ - void vpx_highbd_sad##m##x##n##x##k##_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref_array, \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \ - &ref_array[i], ref_stride); \ - } \ - } - #define highbd_sadMxNx4D(m, n) \ void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[], \ @@ -208,8 +191,6 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, /* clang-format off */ // 64x64 highbd_sadMxN(64, 64) -highbd_sadMxNxK(64, 64, 3) -highbd_sadMxNxK(64, 64, 8) highbd_sadMxNx4D(64, 64) // 64x32 @@ -222,8 +203,6 @@ highbd_sadMxNx4D(32, 64) // 32x32 highbd_sadMxN(32, 32) -highbd_sadMxNxK(32, 32, 3) -highbd_sadMxNxK(32, 32, 8) highbd_sadMxNx4D(32, 32) // 32x16 @@ -236,42 +215,30 @@ highbd_sadMxNx4D(16, 32) // 16x16 highbd_sadMxN(16, 16) -highbd_sadMxNxK(16, 16, 3) -highbd_sadMxNxK(16, 16, 8) highbd_sadMxNx4D(16, 16) // 16x8 highbd_sadMxN(16, 8) -highbd_sadMxNxK(16, 8, 3) -highbd_sadMxNxK(16, 8, 8) highbd_sadMxNx4D(16, 8) // 8x16 highbd_sadMxN(8, 16) -highbd_sadMxNxK(8, 16, 3) -highbd_sadMxNxK(8, 16, 8) highbd_sadMxNx4D(8, 16) // 8x8 highbd_sadMxN(8, 8) -highbd_sadMxNxK(8, 8, 3) -highbd_sadMxNxK(8, 8, 8) highbd_sadMxNx4D(8, 8) // 8x4 highbd_sadMxN(8, 4) -highbd_sadMxNxK(8, 4, 8) highbd_sadMxNx4D(8, 4) // 4x8 highbd_sadMxN(4, 8) -highbd_sadMxNxK(4, 8, 8) highbd_sadMxNx4D(4, 8) // 4x4 highbd_sadMxN(4, 4) -highbd_sadMxNxK(4, 4, 3) -highbd_sadMxNxK(4, 4, 8) highbd_sadMxNx4D(4, 4) /* clang-format on */ diff --git a/libvpx/vpx_dsp/skin_detection.c b/libvpx/vpx_dsp/skin_detection.c new file mode 100644 index 000000000..bbbb6c3a1 --- /dev/null +++ b/libvpx/vpx_dsp/skin_detection.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/skin_detection.h" + +#define MODEL_MODE 1 + +// Fixed-point skin color model parameters. +static const int skin_mean[5][2] = { { 7463, 9614 }, + { 6400, 10240 }, + { 7040, 10240 }, + { 8320, 9280 }, + { 6800, 9614 } }; +static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 +static const int skin_threshold[6] = { 1570636, 1400000, 800000, + 800000, 800000, 800000 }; // q18 +// Thresholds on luminance. +static const int y_low = 40; +static const int y_high = 220; + +// Evaluates the Mahalanobis distance measure for the input CbCr values. +static int vpx_evaluate_skin_color_difference(const int cb, const int cr, + const int idx) { + const int cb_q6 = cb << 6; + const int cr_q6 = cr << 6; + const int cb_diff_q12 = + (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); + const int cbcr_diff_q12 = + (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); + const int cr_diff_q12 = + (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); + const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; + const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; + const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; + const int skin_diff = + skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + + skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; + return skin_diff; +} + +// Checks if the input yCbCr values corresponds to skin color. +int vpx_skin_pixel(const int y, const int cb, const int cr, int motion) { + if (y < y_low || y > y_high) { + return 0; + } else if (MODEL_MODE == 0) { + return (vpx_evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); + } else { + int i = 0; + // Exit on grey. + if (cb == 128 && cr == 128) return 0; + // Exit on very strong cb. + if (cb > 150 && cr < 110) return 0; + for (; i < 5; ++i) { + int skin_color_diff = vpx_evaluate_skin_color_difference(cb, cr, i); + if (skin_color_diff < skin_threshold[i + 1]) { + if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) { + return 0; + } else if (motion == 0 && + skin_color_diff > (skin_threshold[i + 1] >> 1)) { + return 0; + } else { + return 1; + } + } + // Exit if difference is much large than the threshold. + if (skin_color_diff > (skin_threshold[i + 1] << 3)) { + return 0; + } + } + return 0; + } +} diff --git a/libvpx/vpx_dsp/skin_detection.h b/libvpx/vpx_dsp/skin_detection.h new file mode 100644 index 000000000..a2e99baf7 --- /dev/null +++ b/libvpx/vpx_dsp/skin_detection.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_SKIN_DETECTION_H_ +#define VPX_DSP_SKIN_DETECTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +int vpx_skin_pixel(const int y, const int cb, const int cr, int motion); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_SKIN_DETECTION_H_ diff --git a/libvpx/vpx_dsp/txfm_common.h b/libvpx/vpx_dsp/txfm_common.h index fd27f928e..d01d7085a 100644 --- a/libvpx/vpx_dsp/txfm_common.h +++ b/libvpx/vpx_dsp/txfm_common.h @@ -25,42 +25,42 @@ // printf("static const int cospi_%d_64 = %.0f;\n", i, // round(16384 * cos(i*M_PI/64))); // Note: sin(k*Pi/64) = cos((32-k)*Pi/64) -static const tran_high_t cospi_1_64 = 16364; -static const tran_high_t cospi_2_64 = 16305; -static const tran_high_t cospi_3_64 = 16207; -static const tran_high_t cospi_4_64 = 16069; -static const tran_high_t cospi_5_64 = 15893; -static const tran_high_t cospi_6_64 = 15679; -static const tran_high_t cospi_7_64 = 15426; -static const tran_high_t cospi_8_64 = 15137; -static const tran_high_t cospi_9_64 = 14811; -static const tran_high_t cospi_10_64 = 14449; -static const tran_high_t cospi_11_64 = 14053; -static const tran_high_t cospi_12_64 = 13623; -static const tran_high_t cospi_13_64 = 13160; -static const tran_high_t cospi_14_64 = 12665; -static const tran_high_t cospi_15_64 = 12140; -static const tran_high_t cospi_16_64 = 11585; -static const tran_high_t cospi_17_64 = 11003; -static const tran_high_t cospi_18_64 = 10394; -static const tran_high_t cospi_19_64 = 9760; -static const tran_high_t cospi_20_64 = 9102; -static const tran_high_t cospi_21_64 = 8423; -static const tran_high_t cospi_22_64 = 7723; -static const tran_high_t cospi_23_64 = 7005; -static const tran_high_t cospi_24_64 = 6270; -static const tran_high_t cospi_25_64 = 5520; -static const tran_high_t cospi_26_64 = 4756; -static const tran_high_t cospi_27_64 = 3981; -static const tran_high_t cospi_28_64 = 3196; -static const tran_high_t cospi_29_64 = 2404; -static const tran_high_t cospi_30_64 = 1606; -static const tran_high_t cospi_31_64 = 804; +static const tran_coef_t cospi_1_64 = 16364; +static const tran_coef_t cospi_2_64 = 16305; +static const tran_coef_t cospi_3_64 = 16207; +static const tran_coef_t cospi_4_64 = 16069; +static const tran_coef_t cospi_5_64 = 15893; +static const tran_coef_t cospi_6_64 = 15679; +static const tran_coef_t cospi_7_64 = 15426; +static const tran_coef_t cospi_8_64 = 15137; +static const tran_coef_t cospi_9_64 = 14811; +static const tran_coef_t cospi_10_64 = 14449; +static const tran_coef_t cospi_11_64 = 14053; +static const tran_coef_t cospi_12_64 = 13623; +static const tran_coef_t cospi_13_64 = 13160; +static const tran_coef_t cospi_14_64 = 12665; +static const tran_coef_t cospi_15_64 = 12140; +static const tran_coef_t cospi_16_64 = 11585; +static const tran_coef_t cospi_17_64 = 11003; +static const tran_coef_t cospi_18_64 = 10394; +static const tran_coef_t cospi_19_64 = 9760; +static const tran_coef_t cospi_20_64 = 9102; +static const tran_coef_t cospi_21_64 = 8423; +static const tran_coef_t cospi_22_64 = 7723; +static const tran_coef_t cospi_23_64 = 7005; +static const tran_coef_t cospi_24_64 = 6270; +static const tran_coef_t cospi_25_64 = 5520; +static const tran_coef_t cospi_26_64 = 4756; +static const tran_coef_t cospi_27_64 = 3981; +static const tran_coef_t cospi_28_64 = 3196; +static const tran_coef_t cospi_29_64 = 2404; +static const tran_coef_t cospi_30_64 = 1606; +static const tran_coef_t cospi_31_64 = 804; // 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 -static const tran_high_t 
sinpi_1_9 = 5283; -static const tran_high_t sinpi_2_9 = 9929; -static const tran_high_t sinpi_3_9 = 13377; -static const tran_high_t sinpi_4_9 = 15212; +static const tran_coef_t sinpi_1_9 = 5283; +static const tran_coef_t sinpi_2_9 = 9929; +static const tran_coef_t sinpi_3_9 = 13377; +static const tran_coef_t sinpi_4_9 = 15212; #endif // VPX_DSP_TXFM_COMMON_H_ diff --git a/libvpx/vpx_dsp/variance.c b/libvpx/vpx_dsp/variance.c index b1744047a..93bd8f30d 100644 --- a/libvpx/vpx_dsp/variance.c +++ b/libvpx/vpx_dsp/variance.c @@ -8,8 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <assert.h> - #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -166,7 +164,7 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ \ return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ } @@ -226,9 +224,6 @@ MSE(8, 8) void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { int i, j; - /* comp_pred and pred must be 16 byte aligned. */ - assert(((intptr_t)comp_pred & 0xf) == 0); - assert(((intptr_t)pred & 0xf) == 0); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { @@ -468,8 +463,8 @@ static void highbd_var_filter_block2d_bil_second_pass( highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ @@ -488,8 +483,8 @@ static void highbd_var_filter_block2d_bil_second_pass( highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ @@ -508,8 +503,8 @@ static void highbd_var_filter_block2d_bil_second_pass( highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ diff --git a/libvpx/vpx_dsp/variance.h b/libvpx/vpx_dsp/variance.h index 4c482551e..100573299 100644 --- a/libvpx/vpx_dsp/variance.h +++ b/libvpx/vpx_dsp/variance.h @@ -74,8 +74,6 @@ typedef struct vp9_variance_vtable { vpx_variance_fn_t vf; vpx_subpixvariance_fn_t svf; vpx_subp_avg_variance_fn_t svaf; - vpx_sad_multi_fn_t sdx3f; - vpx_sad_multi_fn_t sdx8f; vpx_sad_multi_d_fn_t sdx4df; } vp9_variance_fn_ptr_t; #endif // CONFIG_VP9 diff --git a/libvpx/vpx_dsp/vpx_convolve.c b/libvpx/vpx_dsp/vpx_convolve.c index 02c5a955a..e55a963f9 100644 --- a/libvpx/vpx_dsp/vpx_convolve.c +++ b/libvpx/vpx_dsp/vpx_convolve.c @@ -113,135 +113,107 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, } } -static void 
convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, int y0_q4, - int y_step_q4, int w, int h) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint8_t temp[64 * 135]; - const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= 64); - assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, - x_filters, x0_q4, x_step_q4, w, intermediate_height); - convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h); -} - void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)y0_q4; (void)y_step_q4; - - convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - w, h); + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, + h); } void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - - convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, w, h); + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); } void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)x0_q4; (void)x_step_q4; - - convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, - w, h); + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); } void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t 
dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - - convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, - y_step_q4, w, h); + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, + w, h); } void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. + uint8_t temp[64 * 135]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); } void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { // Fixed size intermediate buffer places limits on parameters. 
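/* Editorial note, not part of the patch: the _avg variant is filter-then-
 * blend -- vpx_convolve8_c() writes into the 64x64 scratch buffer declared
 * below, and vpx_convolve_avg_c() then rounds each pixel into the
 * destination as dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1), i.e.
 * (dst + src + 1) >> 1, exactly as in the avg loop shown further down. */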
DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y, + vpx_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); - vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); + vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); } void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int filter_x_stride, const int16_t *filter_y, - int filter_y_stride, int w, int h) { + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { int r; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; for (r = h; r > 0; --r) { memcpy(dst, src, w); @@ -251,15 +223,16 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int filter_x_stride, const int16_t *filter_y, - int filter_y_stride, int w, int h) { + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { int x, y; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); @@ -269,53 +242,52 @@ void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t 
*filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } #if CONFIG_VP9_HIGHBITDEPTH @@ -417,9 +389,9 @@ static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride, static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h, int bd) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. 
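
The thread running through all of these hunks is the signature migration: every convolve entry point now takes a single InterpKernel table plus explicit x0_q4/y0_q4 phase offsets, hoisting the get_filter_base()/get_filter_offset() lookups out of each kernel and into the caller. A minimal caller-side sketch, assuming both sub-pixel filters come from the same kernel table (as the VP9 filters do) and that the two removed helpers keep the semantics shown in the deleted lines above; convolve8_compat is a hypothetical name, not a libvpx API:

```c
#include <stddef.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"

/* Hedged sketch of bridging the old argument convention to the new one. */
static void convolve8_compat(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  /* Resolve the kernel table and Q4 phase offsets once, up front, exactly
   * as the removed per-function prologues used to. */
  const InterpKernel *const filter = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filter);
  const int y0_q4 = get_filter_offset(filter_y, filter);

  vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                  y0_q4, y_step_q4, w, h);
}
```
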
@@ -442,113 +414,97 @@ static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride, assert(x_step_q4 <= 32); highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - temp, 64, x_filters, x0_q4, x_step_q4, w, + temp, 64, filter, x0_q4, x_step_q4, w, intermediate_height, bd); highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h, bd); + filter, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)y0_q4; (void)y_step_q4; - highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, h, bd); } void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + highbd_convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = 
get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h, bd); + highbd_convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { // Fixed size intermediate buffer places limits on parameters. DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, - filter_y, y_step_q4, w, h, bd); - vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h, + vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); + vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h, bd); } void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { int r; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; for (r = h; r > 0; --r) { @@ -560,15 +516,16 @@ void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { int x, y; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; for (y = 0; y < h; ++y) { diff --git a/libvpx/vpx_dsp/vpx_convolve.h b/libvpx/vpx_dsp/vpx_convolve.h index 1aedd32bd..7979268a9 100644 --- a/libvpx/vpx_dsp/vpx_convolve.h +++ b/libvpx/vpx_dsp/vpx_convolve.h @@ -19,15 +19,15 @@ extern "C" { typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #if CONFIG_VP9_HIGHBITDEPTH typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd); #endif diff --git a/libvpx/vpx_dsp/vpx_dsp.mk b/libvpx/vpx_dsp/vpx_dsp.mk index 6ac7182ab..3b1a873cd 100644 --- a/libvpx/vpx_dsp/vpx_dsp.mk +++ b/libvpx/vpx_dsp/vpx_dsp.mk @@ -50,12 +50,13 @@ DSP_SRCS-yes += intrapred.c DSP_SRCS-$(HAVE_SSE) += 
x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm -DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_intrin_sse2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_intrin_ssse3.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c endif # CONFIG_VP9_HIGHBITDEPTH @@ -87,6 +88,8 @@ DSP_SRCS-yes += vpx_filter.h DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c +DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h +DSP_SRCS-$(HAVE_AVX2) += x86/convolve_avx2.h DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm @@ -104,6 +107,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c endif DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/vpx_scaled_convolve8_neon.c ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM) @@ -194,6 +198,9 @@ endif DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c @@ -207,10 +214,13 @@ DSP_SRCS-yes += inv_txfm.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.h DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM) +DSP_SRCS-$(HAVE_VSX) += ppc/inv_txfm_vsx.c + ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c @@ -237,6 +247,11 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_inv_txfm_sse4.h +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct8x8_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct16x16_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct32x32_add_sse4.c endif # !CONFIG_VP9_HIGHBITDEPTH ifeq ($(HAVE_NEON_ASM),yes) @@ -264,18 +279,19 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += quantize.c DSP_SRCS-yes += quantize.h +DSP_SRCS-$(HAVE_SSE2) += x86/quantize_x86.h DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c +DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c +DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c endif -ifeq ($(ARCH_X86_64),yes) -DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm -DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm -endif # avg DSP_SRCS-yes += avg.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c DSP_SRCS-$(HAVE_NEON) += 
arm/avg_neon.c DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c @@ -286,6 +302,10 @@ DSP_SRCS-$(HAVE_VSX) += ppc/hadamard_vsx.c endif # CONFIG_VP9_ENCODER +# skin detection +DSP_SRCS-yes += skin_detection.h +DSP_SRCS-yes += skin_detection.c + ifeq ($(CONFIG_ENCODERS),yes) DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c @@ -300,11 +320,15 @@ DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c +DSP_SRCS-$(HAVE_MMI) += mips/sad_mmi.c +DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c + DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c +DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm @@ -325,17 +349,19 @@ ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC) DSP_SRCS-yes += variance.c DSP_SRCS-yes += variance.h +DSP_SRCS-$(HAVE_NEON) += arm/avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c +DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c + DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c -DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c ifeq ($(ARCH_X86_64),yes) @@ -354,7 +380,9 @@ endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC # Neon utilities DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/sum_neon.h DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/vpx_convolve8_neon.h # PPC VSX utilities DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h @@ -362,6 +390,7 @@ DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h # X86 utilities +DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) diff --git a/libvpx/vpx_dsp/vpx_dsp_common.h b/libvpx/vpx_dsp/vpx_dsp_common.h index 49d36e545..c8c852374 100644 --- a/libvpx/vpx_dsp/vpx_dsp_common.h +++ b/libvpx/vpx_dsp/vpx_dsp_common.h @@ -43,6 +43,8 @@ typedef int32_t tran_high_t; typedef int16_t tran_low_t; #endif // CONFIG_VP9_HIGHBITDEPTH +typedef int16_t tran_coef_t; + static INLINE uint8_t clip_pixel(int val) { return (val > 255) ? 255 : (val < 0) ? 0 : val; } @@ -55,7 +57,6 @@ static INLINE double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } -#if CONFIG_VP9_HIGHBITDEPTH static INLINE uint16_t clip_pixel_highbd(int val, int bd) { switch (bd) { case 8: @@ -64,7 +65,6 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) { case 12: return (uint16_t)clamp(val, 0, 4095); } } -#endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" diff --git a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index c67483641..1a743d910 100644 --- a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vpx_dsp_forward_decls() { print <<EOF /* @@ -6,6 +16,7 @@ print <<EOF #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" EOF } @@ -19,6 +30,7 @@ if ($opts{arch} eq "x86_64") { $ssse3_x86_64 = 'ssse3'; $avx_x86_64 = 'avx'; $avx2_x86_64 = 'avx2'; + $avx512_x86_64 = 'avx512'; } # @@ -188,21 +200,25 @@ specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/; # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d207_predictor_4x4 sse2/; add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d45_predictor_4x4 neon/; + specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d63_predictor_4x4 sse2/; add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_h_predictor_4x4 neon/; + specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d117_predictor_4x4 sse2/; add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d135_predictor_4x4 neon/; + specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d153_predictor_4x4 sse2/; add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/; @@ -214,30 +230,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_top_predictor_4x4 neon/; + specialize qw/vpx_highbd_dc_top_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_left_predictor_4x4 neon/; + specialize qw/vpx_highbd_dc_left_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_128_predictor_4x4 neon/; + specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const 
uint16_t *left, int bd"; + specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/; add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d45_predictor_8x8 neon/; + specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/; add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_h_predictor_8x8 neon/; + specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/; add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d135_predictor_8x8 neon/; + specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/; add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/; @@ -249,30 +269,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_top_predictor_8x8 neon/; + specialize qw/vpx_highbd_dc_top_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_left_predictor_8x8 neon/; + specialize qw/vpx_highbd_dc_left_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_128_predictor_8x8 neon/; + specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/; add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d45_predictor_16x16 neon/; + specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/; add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_h_predictor_16x16 neon/; + specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize 
qw/vpx_highbd_d117_predictor_16x16 ssse3/; add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d135_predictor_16x16 neon/; + specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/; add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/; @@ -284,30 +308,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_top_predictor_16x16 neon/; + specialize qw/vpx_highbd_dc_top_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_left_predictor_16x16 neon/; + specialize qw/vpx_highbd_dc_left_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_128_predictor_16x16 neon/; + specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/; add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d45_predictor_32x32 neon/; + specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/; add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_h_predictor_32x32 neon/; + specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/; add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d135_predictor_32x32 neon/; + specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/; add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/; @@ -319,81 +347,81 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, 
"uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_top_predictor_32x32 neon/; + specialize qw/vpx_highbd_dc_top_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_left_predictor_32x32 neon/; + specialize qw/vpx_highbd_dc_left_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_128_predictor_32x32 neon/; + specialize qw/vpx_highbd_dc_128_predictor_32x32 neon sse2/; } # CONFIG_VP9_HIGHBITDEPTH # # Sub Pixel Filters # -add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/; -add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx/; -add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx/; -add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/; -add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/; -add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa vsx/; +add_proto qw/void vpx_convolve8_avg/, 
"const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; +specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx/; -add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa vsx/; +add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; +specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/; -add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa vsx/; +add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; +specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx/; -add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_scaled_2d ssse3/; +add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; +specialize qw/vpx_scaled_2d ssse3 neon msa/; -add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int 
x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Sub Pixel Filters # - add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/; - add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/; - add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t 
dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64"; } # CONFIG_VP9_HIGHBITDEPTH @@ -487,28 +515,28 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct4x4 neon sse2/; add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct4x4_1 sse2/; + specialize qw/vpx_fdct4x4_1 sse2 neon/; add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct8x8 neon sse2/; add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct8x8_1 neon sse2/; + specialize qw/vpx_fdct8x8_1 neon sse2 msa/; add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16 sse2/; + specialize qw/vpx_fdct16x16 neon sse2/; add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16_1 sse2/; + specialize qw/vpx_fdct16x16_1 sse2 neon/; add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32 sse2/; + specialize qw/vpx_fdct32x32 neon sse2/; add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32_rd sse2/; + specialize qw/vpx_fdct32x32_rd neon sse2/; add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32_1 sse2/; + specialize qw/vpx_fdct32x32_1 sse2 neon/; add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct4x4 sse2/; @@ -517,6 +545,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_fdct8x8 sse2/; add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vpx_highbd_fdct8x8_1 neon/; + $vpx_highbd_fdct8x8_1_neon=vpx_fdct8x8_1_neon; add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct16x16 sse2/; @@ -535,7 +565,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct4x4 neon sse2 msa/; add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - 
specialize qw/vpx_fdct4x4_1 sse2/; + specialize qw/vpx_fdct4x4_1 sse2 neon/; add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64"; @@ -544,19 +574,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct8x8_1 sse2 neon msa/; add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16 sse2 msa/; + specialize qw/vpx_fdct16x16 neon sse2 msa/; add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16_1 sse2 msa/; + specialize qw/vpx_fdct16x16_1 sse2 neon msa/; add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32 sse2 avx2 msa/; + specialize qw/vpx_fdct32x32 neon sse2 avx2 msa/; add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32_rd sse2 avx2 msa/; + specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa/; add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32_1 sse2 msa/; + specialize qw/vpx_fdct32x32_1 sse2 neon msa/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER @@ -581,25 +611,24 @@ add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { - # Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH is off. - specialize qw/vpx_idct4x4_16_add neon sse2/; + # Note that there are more specializations appended when + # CONFIG_VP9_HIGHBITDEPTH is off. + specialize qw/vpx_idct4x4_16_add neon sse2 vsx/; specialize qw/vpx_idct4x4_1_add neon sse2/; - specialize qw/vpx_idct8x8_64_add neon sse2 ssse3/; + specialize qw/vpx_idct8x8_64_add neon sse2 vsx/; specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/; specialize qw/vpx_idct8x8_1_add neon sse2/; - specialize qw/vpx_idct16x16_256_add neon sse2/; + specialize qw/vpx_idct16x16_256_add neon sse2 vsx/; specialize qw/vpx_idct16x16_38_add neon sse2/; - $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2; specialize qw/vpx_idct16x16_10_add neon sse2/; specialize qw/vpx_idct16x16_1_add neon sse2/; - specialize qw/vpx_idct32x32_1024_add neon sse2 ssse3/; + specialize qw/vpx_idct32x32_1024_add neon sse2 vsx/; specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; - $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_1_add neon sse2/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { - # Note that these specializations appends to the above ones. + # Note that these specializations are appended to the above ones. 
specialize qw/vpx_idct4x4_16_add dspr2 msa/; specialize qw/vpx_idct4x4_1_add dspr2 msa/; specialize qw/vpx_idct8x8_64_add dspr2 msa/; @@ -652,16 +681,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { - specialize qw/vpx_highbd_idct4x4_16_add neon sse2/; - specialize qw/vpx_highbd_idct8x8_64_add neon sse2/; - specialize qw/vpx_highbd_idct8x8_12_add neon sse2/; - specialize qw/vpx_highbd_idct16x16_256_add neon sse2/; - specialize qw/vpx_highbd_idct16x16_38_add neon sse2/; - $vpx_highbd_idct16x16_38_add_sse2=vpx_highbd_idct16x16_256_add_sse2; - specialize qw/vpx_highbd_idct16x16_10_add neon sse2/; - specialize qw/vpx_highbd_idct32x32_1024_add neon/; - specialize qw/vpx_highbd_idct32x32_135_add neon/; - specialize qw/vpx_highbd_idct32x32_34_add neon/; + specialize qw/vpx_highbd_idct4x4_16_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct8x8_64_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct8x8_12_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct16x16_256_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct16x16_38_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct16x16_10_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct32x32_1024_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct32x32_135_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct32x32_34_add neon sse2 sse4_1/; } # !CONFIG_EMULATE_HARDWARE } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9 @@ -671,10 +699,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64"; + specialize qw/vpx_quantize_b neon sse2 ssse3 avx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64"; + specialize qw/vpx_quantize_b_32x32 neon ssse3 avx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; @@ -690,49 +718,49 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") { # Block subtraction # add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vpx_subtract_block neon msa sse2/; +specialize qw/vpx_subtract_block neon msa mmi sse2/; # # Single block SAD # add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, 
const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x64 avx2 neon msa sse2 vsx/; +specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x32 avx2 msa sse2 vsx/; +specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x64 avx2 msa sse2 vsx/; +specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x32 avx2 neon msa sse2 vsx/; +specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x16 avx2 msa sse2 vsx/; +specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x32 msa sse2 vsx/; +specialize qw/vpx_sad16x32 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x16 neon msa sse2 vsx/; +specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x8 neon msa sse2 vsx/; +specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x16 neon msa sse2/; +specialize qw/vpx_sad8x16 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x8 neon msa sse2/; +specialize qw/vpx_sad8x8 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x4 msa sse2/; +specialize qw/vpx_sad8x4 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad4x8 msa sse2/; +specialize qw/vpx_sad4x8 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad4x4 neon msa sse2/; +specialize qw/vpx_sad4x4 neon msa sse2 mmi/; # # Avg @@ -748,23 +776,23 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_minmax_8x8 sse2 neon msa/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff"; + add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64"; - add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff"; - specialize qw/vpx_hadamard_16x16 sse2 neon vsx/; + add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/; add_proto qw/int 
vpx_satd/, "const tran_low_t *coeff, int length"; - specialize qw/vpx_satd sse2 neon/; + specialize qw/vpx_satd avx2 sse2 neon/; } else { - add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; + add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64"; - add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; - specialize qw/vpx_hadamard_16x16 sse2 neon msa vsx/; + add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; + specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/; add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; - specialize qw/vpx_satd sse2 neon msa/; + specialize qw/vpx_satd avx2 sse2 neon msa/; } add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height"; @@ -778,138 +806,120 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { } # CONFIG_VP9_ENCODER add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad64x64_avg avx2 msa sse2/; +specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad64x32_avg avx2 msa sse2/; +specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x64_avg avx2 msa sse2/; +specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x32_avg avx2 msa sse2/; +specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x16_avg avx2 msa sse2/; +specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x32_avg msa sse2/; +specialize qw/vpx_sad16x32_avg neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x16_avg msa sse2/; +specialize qw/vpx_sad16x16_avg neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x8_avg msa sse2/; +specialize qw/vpx_sad16x8_avg neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad8x16_avg msa sse2/; +specialize qw/vpx_sad8x16_avg neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t 
*second_pred"; -specialize qw/vpx_sad8x8_avg msa sse2/; +specialize qw/vpx_sad8x8_avg neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad8x4_avg msa sse2/; +specialize qw/vpx_sad8x4_avg neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad4x8_avg msa sse2/; +specialize qw/vpx_sad4x8_avg neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad4x4_avg msa sse2/; +specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/; # # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally # # Blocks of 3 -add_proto qw/void vpx_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad64x64x3 msa/; - -add_proto qw/void vpx_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad32x32x3 msa/; - add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x16x3 sse3 ssse3 msa/; +specialize qw/vpx_sad16x16x3 sse3 ssse3 msa mmi/; add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x8x3 sse3 ssse3 msa/; +specialize qw/vpx_sad16x8x3 sse3 ssse3 msa mmi/; add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x16x3 sse3 msa/; +specialize qw/vpx_sad8x16x3 sse3 msa mmi/; add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x8x3 sse3 msa/; +specialize qw/vpx_sad8x8x3 sse3 msa mmi/; add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad4x4x3 sse3 msa/; +specialize qw/vpx_sad4x4x3 sse3 msa mmi/; # Blocks of 8 -add_proto qw/void vpx_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad64x64x8 msa/; - -add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad32x32x8 msa/; - add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x16x8 sse4_1 msa/; +specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/; add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x8x8 sse4_1 msa/; +specialize qw/vpx_sad16x8x8 sse4_1 msa mmi/; add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x16x8 sse4_1 msa/; +specialize qw/vpx_sad8x16x8 sse4_1 msa mmi/; add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t 
*ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x8x8 sse4_1 msa/; - -add_proto qw/void vpx_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x4x8 msa/; - -add_proto qw/void vpx_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad4x8x8 msa/; +specialize qw/vpx_sad8x8x8 sse4_1 msa mmi/; add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad4x4x8 sse4_1 msa/; +specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/; # # Multi-block SAD, comparing a reference to N independent blocks # add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad64x64x4d avx2 neon msa sse2/; +specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad64x32x4d msa sse2/; +specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad32x64x4d msa sse2/; +specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad32x32x4d avx2 neon msa sse2/; +specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad32x16x4d msa sse2/; +specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x32x4d msa sse2/; +specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x16x4d neon msa sse2/; +specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x8x4d msa sse2/; +specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x16x4d msa sse2/; +specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x8x4d msa sse2/; +specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x4x4d msa sse2/; +specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, 
const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad4x8x4d msa sse2/; +specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad4x4x4d msa sse2/; +specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; specialize qw/vpx_sum_squares_2d_i16 sse2 msa/; @@ -1016,43 +1026,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; # - # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally - # - # Blocks of 3 - add_proto qw/void vpx_highbd_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - # Blocks of 8 - add_proto qw/void vpx_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - # # Multi-block SAD, comparing a reference to N independent blocks # add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; @@ -1109,43 +1082,43 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes" || 
vpx_config("CONFIG_POSTPROC") eq " # Variance # add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x64 sse2 avx2 neon msa/; + specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi/; add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x32 sse2 avx2 neon msa/; + specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi/; add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x64 sse2 neon msa/; + specialize qw/vpx_variance32x64 sse2 neon msa mmi/; add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x32 sse2 avx2 neon msa/; + specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi/; add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x16 sse2 avx2 neon msa/; + specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi/; add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x32 sse2 neon msa/; + specialize qw/vpx_variance16x32 sse2 neon msa mmi/; add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x16 sse2 avx2 neon msa/; + specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi/; add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x8 sse2 neon msa/; + specialize qw/vpx_variance16x8 sse2 neon msa mmi/; add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x16 sse2 neon msa/; + specialize qw/vpx_variance8x16 sse2 neon msa mmi/; add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x8 sse2 neon msa/; + specialize qw/vpx_variance8x8 sse2 neon msa mmi/; add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x4 sse2 neon msa/; + specialize qw/vpx_variance8x4 sse2 neon msa mmi/; add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance4x8 sse2 neon msa/; + specialize qw/vpx_variance4x8 sse2 neon msa mmi/; add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance4x4 sse2 neon msa/; + specialize qw/vpx_variance4x4 sse2 neon msa mmi/; # # Specialty Variance @@ -1157,16 +1130,16 @@ add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, co specialize qw/vpx_get8x8var sse2 neon msa/; add_proto 
qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vpx_mse16x16 sse2 avx2 neon msa/; + specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi/; add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vpx_mse16x8 sse2 msa/; + specialize qw/vpx_mse16x8 sse2 msa mmi/; add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vpx_mse8x16 sse2 msa/; + specialize qw/vpx_mse8x16 sse2 msa mmi/; add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vpx_mse8x8 sse2 msa/; + specialize qw/vpx_mse8x8 sse2 msa mmi/; add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; specialize qw/vpx_get_mb_ss sse2 msa vsx/; @@ -1175,88 +1148,88 @@ add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int specialize qw/vpx_get4x4sse_cs neon msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; - specialize qw/vpx_comp_avg_pred sse2 vsx/; + specialize qw/vpx_comp_avg_pred neon sse2 vsx/; # # Subpixel Variance # add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance64x32 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance64x32 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance32x64 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance32x64 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance32x16 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance32x16 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance16x32 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance16x32 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize 
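vpx_comp_avg_pred, which picks up a neon specialization just below, averages two predictors with round-to-nearest; the signature matches the prototype in this hunk. A minimal sketch of that semantics (comp_avg_pred_ref is a hypothetical name):

#include <stdint.h>

/* Compound prediction: each output pixel is the rounded mean of the
 * two predictors. pred is packed at width; ref has its own stride. */
static void comp_avg_pred_ref(uint8_t *comp_pred, const uint8_t *pred,
                              int width, int height, const uint8_t *ref,
                              int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}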
qw/vpx_sub_pixel_variance16x16 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance16x8 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance16x8 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance8x16 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance8x16 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance8x8 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance8x4 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance8x4 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance4x8 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance4x8 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance4x4 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance4x4 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance64x32 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance32x64 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance32x64 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance32x32 neon avx2 msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance32x16 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance32x16 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance16x32 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance16x32 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance16x16 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance16x16 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance16x8 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance16x8 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance8x16 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance8x16 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance8x8 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance8x8 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance8x4 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance8x4 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance4x8 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance4x8 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance4x4 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance4x4 neon msa mmi sse2 ssse3/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; diff --git a/libvpx/vpx_dsp/vpx_filter.h b/libvpx/vpx_dsp/vpx_filter.h index 26d690501..6cea251bc 100644 --- a/libvpx/vpx_dsp/vpx_filter.h +++ b/libvpx/vpx_dsp/vpx_filter.h @@ -26,17 +26,6 @@ extern "C" { typedef int16_t InterpKernel[SUBPEL_TAPS]; -static INLINE const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. 
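The get_filter_base/get_filter_offset helpers removed in the vpx_filter.h hunk here recovered the kernel-table base by masking the low 8 address bits, which is only valid while the 16-kernel x 8-tap x 2-byte table stays 256-byte aligned (hence the NOTE above). The reworked convolve API elsewhere in this change passes the InterpKernel table plus x0_q4/y0_q4 offsets directly, so the trick becomes unnecessary. For reference, the masking arithmetic, renamed to avoid clashing with the removed symbols:

#include <assert.h>
#include <stdint.h>

typedef int16_t InterpKernel[8]; /* SUBPEL_TAPS == 8 */

/* 16 kernels * 8 taps * 2 bytes = 256 bytes, so clearing the low 8
 * address bits of any kernel pointer recovers the table base. */
static const InterpKernel *filter_base(const int16_t *filter) {
  return (const InterpKernel *)(((intptr_t)filter) & ~(intptr_t)0xFF);
}

static int filter_offset(const int16_t *f, const InterpKernel *base) {
  const int o = (int)((const InterpKernel *)(intptr_t)f - base);
  assert(o >= 0 && o < 16);
  return o;
}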
- // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static INLINE int get_filter_offset(const int16_t *f, - const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vpx_dsp/x86/add_noise_sse2.asm b/libvpx/vpx_dsp/x86/add_noise_sse2.asm index f758da22d..80cced4ce 100644 --- a/libvpx/vpx_dsp/x86/add_noise_sse2.asm +++ b/libvpx/vpx_dsp/x86/add_noise_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, ; int blackclamp, int whiteclamp, ; int width, int height, int pitch) @@ -26,13 +28,13 @@ sym(vpx_plane_add_noise_sse2): mov rdx, 0x01010101 mov rax, arg(2) mul rdx - movd xmm3, rax + movq xmm3, rax pshufd xmm3, xmm3, 0 ; xmm3 is 16 copies of char in blackclamp mov rdx, 0x01010101 mov rax, arg(3) mul rdx - movd xmm4, rax + movq xmm4, rax pshufd xmm4, xmm4, 0 ; xmm4 is 16 copies of char in whiteclamp movdqu xmm5, xmm3 ; both clamp = black clamp + white clamp diff --git a/libvpx/vpx_dsp/x86/avg_intrin_avx2.c b/libvpx/vpx_dsp/x86/avg_intrin_avx2.c new file mode 100644 index 000000000..ff19ea647 --- /dev/null +++ b/libvpx/vpx_dsp/x86/avg_intrin_avx2.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" +#include "vpx_ports/mem.h" + +static void hadamard_col8x2_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi16(a0, a1); + __m256i b1 = _mm256_sub_epi16(a0, a1); + __m256i b2 = _mm256_add_epi16(a2, a3); + __m256i b3 = _mm256_sub_epi16(a2, a3); + __m256i b4 = _mm256_add_epi16(a4, a5); + __m256i b5 = _mm256_sub_epi16(a4, a5); + __m256i b6 = _mm256_add_epi16(a6, a7); + __m256i b7 = _mm256_sub_epi16(a6, a7); + + a0 = _mm256_add_epi16(b0, b2); + a1 = _mm256_add_epi16(b1, b3); + a2 = _mm256_sub_epi16(b0, b2); + a3 = _mm256_sub_epi16(b1, b3); + a4 = _mm256_add_epi16(b4, b6); + a5 = _mm256_add_epi16(b5, b7); + a6 = _mm256_sub_epi16(b4, b6); + a7 = _mm256_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi16(a0, a4); + b7 = _mm256_add_epi16(a1, a5); + b3 = _mm256_add_epi16(a2, a6); + b4 = _mm256_add_epi16(a3, a7); + b2 = _mm256_sub_epi16(a0, a4); + b6 = _mm256_sub_epi16(a1, a5); + b1 = _mm256_sub_epi16(a2, a6); + b5 = _mm256_sub_epi16(a3, a7); + + a0 = _mm256_unpacklo_epi16(b0, b1); + a1 = _mm256_unpacklo_epi16(b2, b3); + a2 = _mm256_unpackhi_epi16(b0, b1); + a3 = _mm256_unpackhi_epi16(b2, b3); + a4 = _mm256_unpacklo_epi16(b4, b5); + a5 = _mm256_unpacklo_epi16(b6, b7); + a6 = _mm256_unpackhi_epi16(b4, b5); + a7 = _mm256_unpackhi_epi16(b6, b7); + + b0 = _mm256_unpacklo_epi32(a0, a1); + b1 = _mm256_unpacklo_epi32(a4, a5); + b2 = _mm256_unpackhi_epi32(a0, a1); + b3 = _mm256_unpackhi_epi32(a4, a5); + b4 = _mm256_unpacklo_epi32(a2, 
a3); + b5 = _mm256_unpacklo_epi32(a6, a7); + b6 = _mm256_unpackhi_epi32(a2, a3); + b7 = _mm256_unpackhi_epi32(a6, a7); + + in[0] = _mm256_unpacklo_epi64(b0, b1); + in[1] = _mm256_unpackhi_epi64(b0, b1); + in[2] = _mm256_unpacklo_epi64(b2, b3); + in[3] = _mm256_unpackhi_epi64(b2, b3); + in[4] = _mm256_unpacklo_epi64(b4, b5); + in[5] = _mm256_unpackhi_epi64(b4, b5); + in[6] = _mm256_unpacklo_epi64(b6, b7); + in[7] = _mm256_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm256_add_epi16(a0, a4); + in[7] = _mm256_add_epi16(a1, a5); + in[3] = _mm256_add_epi16(a2, a6); + in[4] = _mm256_add_epi16(a3, a7); + in[2] = _mm256_sub_epi16(a0, a4); + in[6] = _mm256_sub_epi16(a1, a5); + in[1] = _mm256_sub_epi16(a2, a6); + in[5] = _mm256_sub_epi16(a3, a7); + } +} + +static void hadamard_8x8x2_avx2(int16_t const *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + __m256i src[8]; + src[0] = _mm256_loadu_si256((const __m256i *)src_diff); + src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + + hadamard_col8x2_avx2(src, 0); + hadamard_col8x2_avx2(src, 1); + + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x31)); +} + +void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + + for (idx = 0; idx < 2; ++idx) { + int16_t const *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + 
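The three add/sub layers in hadamard_col8x2_avx2 above form a radix-2 8-point Walsh-Hadamard butterfly, applied once per dimension. A scalar sketch of one 8-point pass using the same wrapping 16-bit arithmetic; the SIMD version additionally folds an output permutation and the transpose into its iter == 0 path (hadamard8_1d is a hypothetical name):

#include <stdint.h>

/* One unnormalized 8-point Walsh-Hadamard pass: pairs, quads, then
 * halves, matching the b-then-a add/sub structure above. */
static void hadamard8_1d(const int16_t *in, int16_t *out) {
  int16_t b[8], a[8];
  int i;
  for (i = 0; i < 4; ++i) {
    b[2 * i + 0] = in[2 * i] + in[2 * i + 1];
    b[2 * i + 1] = in[2 * i] - in[2 * i + 1];
  }
  for (i = 0; i < 2; ++i) {
    a[4 * i + 0] = b[4 * i + 0] + b[4 * i + 2];
    a[4 * i + 1] = b[4 * i + 1] + b[4 * i + 3];
    a[4 * i + 2] = b[4 * i + 0] - b[4 * i + 2];
    a[4 * i + 3] = b[4 * i + 1] - b[4 * i + 3];
  }
  for (i = 0; i < 4; ++i) {
    out[i] = a[i] + a[i + 4];
    out[i + 4] = a[i] - a[i + 4];
  }
}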
store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + + coeff += 16; + t_coeff += 16; + } +} + +int vpx_satd_avx2(const tran_low_t *coeff, int length) { + const __m256i one = _mm256_set1_epi16(1); + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 16) { + const __m256i src_line = load_tran_low(coeff); + const __m256i abs = _mm256_abs_epi16(src_line); + const __m256i sum = _mm256_madd_epi16(abs, one); + accum = _mm256_add_epi32(accum, sum); + coeff += 16; + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} diff --git a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c index 4e89e07e5..a235ba41d 100644 --- a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c +++ b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c @@ -214,7 +214,7 @@ static void hadamard_col8_sse2(__m128i *in, int iter) { } } -void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, +void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { __m128i src[8]; src[0] = _mm_load_si128((const __m128i *)src_diff); @@ -246,7 +246,7 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, store_tran_low(src[7], coeff); } -void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, +void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { diff --git a/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h b/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h index b9116f049..3552c07cd 100644 --- a/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h +++ b/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h @@ -21,10 +21,24 @@ static INLINE __m256i load_tran_low(const tran_low_t *a) { #if CONFIG_VP9_HIGHBITDEPTH const __m256i a_low = _mm256_loadu_si256((const __m256i *)a); - return _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8)); + const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8)); + return _mm256_packs_epi32(a_low, a_high); #else return _mm256_loadu_si256((const __m256i *)a); #endif } +static INLINE void store_tran_low(__m256i a, tran_low_t *b) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_hi = _mm256_mulhi_epi16(a, one); + const __m256i a_lo = _mm256_mullo_epi16(a, one); + const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi); + const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi); + _mm256_storeu_si256((__m256i *)b, a_1); + _mm256_storeu_si256((__m256i *)(b + 8), a_2); +#else + _mm256_storeu_si256((__m256i *)b, a); +#endif +} #endif // VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ diff --git a/libvpx/vpx_dsp/x86/convolve.h b/libvpx/vpx_dsp/x86/convolve.h index e69d6c617..68d7589d4 100644 --- a/libvpx/vpx_dsp/x86/convolve.h +++ b/libvpx/vpx_dsp/x86/convolve.h @@ -20,14 +20,15 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter); -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ +#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ void 
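vpx_satd_avx2 in this hunk vectorizes a very small reduction; the scalar equivalent is just the sum of absolute coefficients. For comparison (satd_ref is a hypothetical name; tran_low_t is int32_t only under CONFIG_VP9_HIGHBITDEPTH, int16_t otherwise):

#include <stdint.h>
#include <stdlib.h>

typedef int32_t tran_low_t; /* assuming the highbitdepth build */

/* Sum of absolute transformed differences. */
static int satd_ref(const tran_low_t *coeff, int length) {
  int i, satd = 0;
  for (i = 0; i < length; ++i) satd += abs(coeff[i]);
  return satd;
}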
vpx_convolve8_##name##_##opt( \ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - (void)filter_x; \ + ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + const int16_t *filter = filter_kernel[offset]; \ + (void)x0_q4; \ (void)x_step_q4; \ - (void)filter_y; \ + (void)y0_q4; \ (void)y_step_q4; \ assert(filter[3] != 128); \ assert(step_q4 == 16); \ @@ -64,32 +65,36 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, } \ } -#define FUN_CONV_2D(avg, opt) \ - void vpx_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - assert(filter_x[3] != 128); \ - assert(filter_y[3] != 128); \ - assert(w <= 64); \ - assert(h <= 64); \ - assert(x_step_q4 == 16); \ - assert(y_step_q4 == 16); \ - if (filter_x[0] | filter_x[1] | filter_x[2]) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ - vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h + 7); \ - vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ - vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h + 1); \ - vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h); \ - } \ +#define FUN_CONV_2D(avg, opt) \ + void vpx_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + const int16_t *filter_x = filter[x0_q4]; \ + const int16_t *filter_y = filter[y0_q4]; \ + (void)filter_y; \ + assert(filter_x[3] != 128); \ + assert(filter_y[3] != 128); \ + assert(w <= 64); \ + assert(h <= 64); \ + assert(x_step_q4 == 16); \ + assert(y_step_q4 == 16); \ + if (filter_x[0] | filter_x[1] | filter_x[2]) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ + vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ + h + 7); \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ + vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \ + x_step_q4, y0_q4, y_step_q4, w, h + 1); \ + vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ + h); \ + } \ } #if CONFIG_VP9_HIGHBITDEPTH @@ -101,95 +106,97 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, unsigned int output_height, const int16_t *filter, int bd); -#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_highbd_convolve8_##name##_##opt( \ - const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - if (filter[0] | filter[1] | filter[2]) 
{ \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h, bd); \ - } \ - } - -#define HIGH_FUN_CONV_2D(avg, opt) \ - void vpx_highbd_convolve8_##avg##opt( \ +#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ + void vpx_highbd_convolve8_##name##_##opt( \ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ - fdata2, 64, filter_x, x_step_q4, \ - filter_y, y_step_q4, w, h + 7, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - fdata2 + 192, 64, dst, dst_stride, filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h, bd); \ + ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter = filter_kernel[offset]; \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h + 1, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h, bd); \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ + 
src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ } \ - } else { \ - vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ } \ + if (w) { \ + vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + filter_kernel, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h, bd); \ + } \ + } + +#define HIGH_FUN_CONV_2D(avg, opt) \ + void vpx_highbd_convolve8_##avg##opt( \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter_x = filter[x0_q4]; \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ + vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + fdata2, 64, filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h + 7, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ + vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, \ + w, h + 1, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h, bd); \ + } \ + } else { \ + vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \ + bd); \ + } \ } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/x86/convolve_avx2.h b/libvpx/vpx_dsp/x86/convolve_avx2.h new file mode 100644 index 000000000..bc96b738f --- /dev/null +++ b/libvpx/vpx_dsp/x86/convolve_avx2.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_CONVOLVE_AVX2_H_ +#define VPX_DSP_X86_CONVOLVE_AVX2_H_ + +#include <immintrin.h> // AVX2 + +#include "./vpx_config.h" + +#if defined(__clang__) +#if (__clang_major__ > 0 && __clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#else // clang > 3.3, and not 5.0 on macosx. 
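The FUN_CONV_1D/HIGH_FUN_CONV_1D macros in the convolve.h hunk above all expand to the same dispatch shape: cover the width in 16-, 8- and 4-pixel strips with the generated block kernels, then hand any remainder to the C fallback. A sketch of that shape with the strip kernels as parameters (the real macros hard-code the generated vpx_filter_block1d* names; conv_dispatch_sketch is hypothetical):

#include <stdint.h>

typedef void (*strip_fn)(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int h, const int16_t *filter);

/* Width dispatch: widest strips first, narrower strips for the rest. */
static void conv_dispatch_sketch(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h,
                                 const int16_t *filter,
                                 strip_fn b16, strip_fn b8, strip_fn b4) {
  while (w >= 16) {
    b16(src, src_stride, dst, dst_stride, h, filter);
    src += 16; dst += 16; w -= 16;
  }
  while (w >= 8) {
    b8(src, src_stride, dst, dst_stride, h, filter);
    src += 8; dst += 8; w -= 8;
  }
  while (w >= 4) {
    b4(src, src_stride, dst, dst_stride, h, filter);
    src += 4; dst += 4; w -= 4;
  }
  /* the real macros finish with: if (w) vpx_convolve8_..._c(...) */
  (void)w;
}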
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 +#elif defined(__GNUC__) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static INLINE void shuffle_filter_avx2(const int16_t *const filter, + __m256i *const f) { + const __m256i f_values = + MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter)); + // pack and duplicate the filter values + f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u)); + f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u)); + f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u)); + f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu)); +} + +static INLINE __m256i convolve8_16_avx2(const __m256i *const s, + const __m256i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m256i k_64 = _mm256_set1_epi16(1 << 6); + const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]); + const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]); + const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]); + const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]); + __m256i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm256_add_epi16(x0, x2); + sum2 = _mm256_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm256_add_epi16(sum1, k_64); + sum1 = _mm256_adds_epi16(sum1, sum2); + // round and shift by 7 bit each 16 bit + sum1 = _mm256_srai_epi16(sum1, 7); + return sum1; +} + +static INLINE __m128i convolve8_8_avx2(const __m256i *const s, + const __m256i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]), + _mm256_castsi256_si128(f[0])); + const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]), + _mm256_castsi256_si128(f[1])); + const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]), + _mm256_castsi256_si128(f[2])); + const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]), + _mm256_castsi256_si128(f[3])); + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm_add_epi16(x0, x2); + sum2 = _mm_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm_add_epi16(sum1, k_64); + sum1 = _mm_adds_epi16(sum1, sum2); + // shift by 7 bit each 16 bit + sum1 = _mm_srai_epi16(sum1, 7); + return sum1; +} + +#undef MM256_BROADCASTSI128_SI256 + +#endif // VPX_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/libvpx/vpx_dsp/x86/convolve_ssse3.h b/libvpx/vpx_dsp/x86/convolve_ssse3.h new file mode 100644 index 000000000..e5d452f99 --- /dev/null +++ b/libvpx/vpx_dsp/x86/convolve_ssse3.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_CONVOLVE_SSSE3_H_ +#define VPX_DSP_X86_CONVOLVE_SSSE3_H_ + +#include <assert.h> +#include <tmmintrin.h> // SSSE3 + +#include "./vpx_config.h" + +static INLINE void shuffle_filter_ssse3(const int16_t *const filter, + __m128i *const f) { + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); +} + +static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter, + __m128i *const f) { + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + // It utilizes the fact that the high byte of filter[3] is always 0 to clean + // half of f[0] and f[4]. + assert(filter[3] >= 0 && filter[3] < 256); + f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u)); + f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u)); + f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u)); + f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au)); + f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu)); +} + +static INLINE __m128i convolve8_8_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm_add_epi16(x0, x2); + sum2 = _mm_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm_add_epi16(sum1, k_64); + sum1 = _mm_adds_epi16(sum1, sum2); + // shift by 7 bit each 16 bit + sum1 = _mm_srai_epi16(sum1, 7); + return sum1; +} + +static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + // compensate the subtracted 64 in f[1]. x4 is always non negative. 
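A scalar model of the rounding scheme these convolve helpers share: 7-bit fixed-point taps, +64 for round-to-nearest, arithmetic shift by 7, clamp to 8 bits. The even/odd-offset variants continuing below additionally re-add the 64 noted in the comment as subtracted from the centre tap, so the packed taps fit a signed-byte range (convolve8_pixel and clip_u8 are hypothetical names):

#include <stdint.h>

static uint8_t clip_u8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One output pixel of an 8-tap filter with FILTER_BITS == 7. The SIMD
 * versions add pairwise products as (x0 + x2) then (x1 + x3) so only
 * the final add needs saturation. */
static uint8_t convolve8_pixel(const uint8_t *src, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k] * filter[k];
  return clip_u8((sum + 64) >> 7);
}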
+ const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64)); + // add and saturate the results together + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x4); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]); + // compensate the subtracted 64 in f[2]. x5 is always non negative. + const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64)); + __m128i temp; + + // add and saturate the results together + temp = _mm_adds_epi16(x0, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x3); + temp = _mm_adds_epi16(temp, x4); + temp = _mm_adds_epi16(temp, x5); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +#endif // VPX_DSP_X86_CONVOLVE_SSSE3_H_ diff --git a/libvpx/vpx_dsp/x86/deblock_sse2.asm b/libvpx/vpx_dsp/x86/deblock_sse2.asm index bd8fd1248..97cb43b67 100644 --- a/libvpx/vpx_dsp/x86/deblock_sse2.asm +++ b/libvpx/vpx_dsp/x86/deblock_sse2.asm @@ -83,6 +83,8 @@ add rbx, 16 %endmacro +SECTION .text + ;void vpx_post_proc_down_and_across_mb_row_sse2 ;( ; unsigned char *src_ptr, diff --git a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h index 39d3a3f59..132e06523 100644 --- a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h +++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h @@ -51,7 +51,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64); const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64); const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); diff --git a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h index 374433390..32b9bd281 100644 --- a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h +++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h @@ -63,7 +63,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. 
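The "constructing the 32 bit constant corresponding to that pair" comment above refers to packing two 16-bit cosine terms into each 32-bit lane, so that pmaddwd over interleaved inputs yields a*c0 + b*c1 per lane in one instruction. A sketch of the packing (pair_const is a hypothetical name mirroring pair_set_epi16):

#include <stdint.h>

/* Low 16 bits hold c0, high 16 bits hold c1; broadcasting this across
 * a vector gives the pw_<c0>_<c1> constants used above. */
static int32_t pair_const(int16_t c0, int16_t c1) {
  return (int32_t)(((uint32_t)(uint16_t)c1 << 16) | (uint16_t)c0);
}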
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); diff --git a/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h index 743e55e63..f9abaecf2 100644 --- a/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h +++ b/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h @@ -261,7 +261,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -582,7 +582,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); diff --git a/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm index b433874f2..32824a03a 100644 --- a/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm +++ b/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -31,8 +31,8 @@ SECTION .text INIT_XMM ssse3 cglobal fdct8x8, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m12, [pw_11585x2] + mova m8, [GLOBAL(pd_8192)] + mova m12, [GLOBAL(pw_11585x2)] lea r3, [2 * strideq] lea r4, [4 * strideq] @@ -92,10 +92,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; sin(pi / 8), cos(pi / 8) punpcklwd m2, m10, m9 punpckhwd m10, m9 - pmaddwd m5, m2, [pw_15137_6270] - pmaddwd m2, [pw_6270_m15137] - pmaddwd m9, m10, [pw_15137_6270] - pmaddwd m10, [pw_6270_m15137] + pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] + pmaddwd m2, [GLOBAL(pw_6270_m15137)] + pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] + pmaddwd m10, [GLOBAL(pw_6270_m15137)] paddd m5, m8 paddd m2, m8 paddd m9, m8 @@ -120,10 +120,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; sin(pi / 16), cos(pi / 16) punpcklwd m1, m10, m9 punpckhwd m10, m9 - pmaddwd m7, m1, [pw_16069_3196] - pmaddwd m1, [pw_3196_m16069] - pmaddwd m9, m10, [pw_16069_3196] - pmaddwd m10, [pw_3196_m16069] + pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] + pmaddwd m1, [GLOBAL(pw_3196_m16069)] + pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] + pmaddwd m10, [GLOBAL(pw_3196_m16069)] paddd m7, m8 paddd m1, m8 paddd m9, m8 @@ -138,10 +138,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; sin(3 * pi / 16), cos(3 * pi / 16) punpcklwd m11, m0, m3 punpckhwd m0, m3 - pmaddwd m9, m11, [pw_9102_13623] - pmaddwd m11, 
[pw_13623_m9102] - pmaddwd m3, m0, [pw_9102_13623] - pmaddwd m0, [pw_13623_m9102] + pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] + pmaddwd m11, [GLOBAL(pw_13623_m9102)] + pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] + pmaddwd m0, [GLOBAL(pw_13623_m9102)] paddd m9, m8 paddd m11, m8 paddd m3, m8 @@ -211,10 +211,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; stage 3 punpcklwd m6, m1, m3 punpckhwd m1, m3 - pmaddwd m2, m6, [pw_11585_11585] - pmaddwd m6, [pw_11585_m11585] - pmaddwd m3, m1, [pw_11585_11585] - pmaddwd m1, [pw_11585_m11585] + pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] + pmaddwd m6, [GLOBAL(pw_11585_m11585)] + pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] + pmaddwd m1, [GLOBAL(pw_11585_m11585)] paddd m2, m8 paddd m6, m8 paddd m3, m8 @@ -231,10 +231,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride punpcklwd m3, m5, m4 punpckhwd m5, m4 - pmaddwd m1, m3, [pw_15137_6270] - pmaddwd m3, [pw_6270_m15137] - pmaddwd m4, m5, [pw_15137_6270] - pmaddwd m5, [pw_6270_m15137] + pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] + pmaddwd m3, [GLOBAL(pw_6270_m15137)] + pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] + pmaddwd m5, [GLOBAL(pw_6270_m15137)] paddd m1, m8 paddd m3, m8 paddd m4, m8 @@ -255,10 +255,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; stage 4 punpcklwd m9, m5, m4 punpckhwd m5, m4 - pmaddwd m7, m9, [pw_16069_3196] - pmaddwd m9, [pw_3196_m16069] - pmaddwd m4, m5, [pw_16069_3196] - pmaddwd m5, [pw_3196_m16069] + pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] + pmaddwd m9, [GLOBAL(pw_3196_m16069)] + pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] + pmaddwd m5, [GLOBAL(pw_3196_m16069)] paddd m7, m8 paddd m9, m8 paddd m4, m8 @@ -272,10 +272,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride punpcklwd m4, m10, m0 punpckhwd m10, m0 - pmaddwd m5, m4, [pw_9102_13623] - pmaddwd m4, [pw_13623_m9102] - pmaddwd m0, m10, [pw_9102_13623] - pmaddwd m10, [pw_13623_m9102] + pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] + pmaddwd m4, [GLOBAL(pw_13623_m9102)] + pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] + pmaddwd m10, [GLOBAL(pw_13623_m9102)] paddd m5, m8 paddd m4, m8 paddd m0, m8 diff --git a/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c index 2fc7b7430..7e75d5d10 100644 --- a/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c +++ b/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c @@ -18,13 +18,14 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int width, int h, int bd) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; assert(width % 4 == 0); @@ -99,13 +100,14 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int width, int h, int bd) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; assert(width % 4 == 0); @@ -1073,8 +1075,8 @@ void vpx_highbd_filter_block1d4_v2_sse2(const 
uint16_t *, ptrdiff_t, uint16_t *, #define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2 #define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2 -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); HIGH_FUN_CONV_2D(, avx2); void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t, @@ -1098,8 +1100,8 @@ void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, #define vpx_highbd_filter_block1d4_v2_avg_avx2 \ vpx_highbd_filter_block1d4_v2_avg_sse2 -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2); HIGH_FUN_CONV_2D(avg_, avx2); diff --git a/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c index a2412d124..f4f7235d1 100644 --- a/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c @@ -8,237 +8,343 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <emmintrin.h> // SSE2 + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" +static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, + __m128i *const out) { + // stage 5 + out[0] = _mm_add_epi32(in[0], in[3]); + out[1] = _mm_add_epi32(in[1], in[2]); + out[2] = _mm_sub_epi32(in[1], in[2]); + out[3] = _mm_sub_epi32(in[0], in[3]); + highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]); + out[8] = _mm_add_epi32(in[8], in[11]); + out[9] = _mm_add_epi32(in[9], in[10]); + out[10] = _mm_sub_epi32(in[9], in[10]); + out[11] = _mm_sub_epi32(in[8], in[11]); + out[12] = _mm_sub_epi32(in[15], in[12]); + out[13] = _mm_sub_epi32(in[14], in[13]); + out[14] = _mm_add_epi32(in[14], in[13]); + out[15] = _mm_add_epi32(in[15], in[12]); +} + +static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); + out[8] = in[8]; + out[9] = in[9]; + highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]); + highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]); + out[14] = in[14]; + out[15] = in[15]; +} + +static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + 
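Each highbd_butterfly_sse2 call in the idct16 stages below is a 2-point rotation by a (c0, c1) cosine pair followed by a 14-bit fixed-point round, matching the scalar vpx_idct16_c. A sketch (butterfly and round_shift are hypothetical names):

#include <stdint.h>

#define DCT_CONST_BITS 14

static int32_t round_shift(int64_t v) {
  return (int32_t)((v + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* (in0, in1) -> (in0*c0 - in1*c1, in0*c1 + in1*c0), each rounded. */
static void butterfly(int32_t in0, int32_t in1, int c0, int c1,
                      int32_t *out0, int32_t *out1) {
  *out0 = round_shift((int64_t)in0 * c0 - (int64_t)in1 * c1);
  *out1 = round_shift((int64_t)in0 * c1 + (int64_t)in1 * c0);
}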
highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]); + highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp1[2], sign[2]; + + // stage 2 + highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + abs_extend_64bit_sse2(io[0], temp1, sign); + step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + step2[1] = step2[0]; + highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp[2], sign[2]; + + // stage 2 + 
highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = + _mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10] + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = + _mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13] + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + abs_extend_64bit_sse2(io[0], temp, sign); + step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64); + step2[1] = step2[0]; + step2[2] = _mm_setzero_si128(); + step2[3] = _mm_setzero_si128(); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } + int i; + __m128i out[16], *in; - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + idct16_8col(in, in); + in = r; + input += 128; } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_16x16(inptr, inptr + 16); - for (i = 0; i < 16; 
i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; + dest += 8; } } else { - // Run the un-optimised row transform - for (i = 0; i < 16; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } + __m128i all[4][16]; - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); - } + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + highbd_idct16_4col(in); + input += 4 * 16; } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct16_4col(out); + for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + highbd_write_buffer_4(dest + j * stride, out[j], bd); } + dest += 4; } } } -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, +void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = 
_mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } + int i; + __m128i out[16]; - // Find the min & max for the row transform - // Since all non-zero dct coefficients are in upper-left 4x4 area, - // we only need to consider first 4 rows here. - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform (N.B. This transposes inptr) - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 16; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); + if (bd == 8) { + __m128i in[16], temp[16]; + + highbd_load_pack_transpose_32bit_8x8(input, 16, in); + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_8x8(inptr, inptr); - array_transpose_8x8(inptr + 8, inptr + 16); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; + dest += 8; } } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(input, 16, in); + highbd_idct16x16_38_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + highbd_idct16x16_38_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; } } +} + +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], l[16]; + + in[0] = load_pack_8_32bit(input + 0 * 16); + in[1] = load_pack_8_32bit(input + 1 * 16); + in[2] = load_pack_8_32bit(input + 2 * 16); + in[3] = 
load_pack_8_32bit(input + 3 * 16); - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); + idct16x16_10_pass1(in, l); + + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, in[j], bd); } + dest += 8; } } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_4x4(input, 16, in); + highbd_idct16x16_10_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(&all[0][i], out); + highbd_idct16x16_10_4col(out); + for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + highbd_write_buffer_4(dest + j * stride, out[j], bd); } + dest += 4; } } } diff --git a/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c new file mode 100644 index 000000000..de097c66a --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, + __m128i *const out) { + // stage 5 + out[0] = _mm_add_epi32(in[0], in[3]); + out[1] = _mm_add_epi32(in[1], in[2]); + out[2] = _mm_sub_epi32(in[1], in[2]); + out[3] = _mm_sub_epi32(in[0], in[3]); + highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]); + out[8] = _mm_add_epi32(in[8], in[11]); + out[9] = _mm_add_epi32(in[9], in[10]); + out[10] = _mm_sub_epi32(in[9], in[10]); + out[11] = _mm_sub_epi32(in[8], in[11]); + out[12] = _mm_sub_epi32(in[15], in[12]); + out[13] = _mm_sub_epi32(in[14], in[13]); + out[14] = _mm_add_epi32(in[14], in[13]); + out[15] = _mm_add_epi32(in[15], in[12]); +} + +static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); + out[8] = in[8]; + out[9] = in[9]; + highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]); + highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]); + out[14] = in[14]; + out[15] = in[15]; +} + +static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + highbd_butterfly_sse4_1(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse4_1(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse4_1(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse4_1(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_butterfly_sse4_1(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]); + highbd_butterfly_sse4_1(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_38_4col(__m128i 
*const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp1[2]; + + // stage 2 + highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(io[7], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse4_1(io[5], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_sse4_1(io[6], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + extend_64bit(io[0], temp1); + step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + step2[1] = step2[0]; + highbd_partial_butterfly_sse4_1(io[4], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp[2]; + + // stage 2 + highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + extend_64bit(io[0], temp); + step2[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + step2[1] = step2[0]; + step2[2] = _mm_setzero_si128(); + step2[3] = _mm_setzero_si128(); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + 
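/* [editor's annotation] In the bd == 8 branch the coefficients fit in 16
 * bits, so these loads pack the 32-bit tran_low_t input down to 16-bit
 * lanes and reuse the low-bit-depth idct16_8col() path; higher bit depths
 * take the 32-bit highbd_idct16_4col() route in the else branch below. */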
highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + idct16_8col(in, in); + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + highbd_idct16_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct16_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], temp[16]; + + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); + } + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(input, 16, in); + highbd_idct16x16_38_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + highbd_idct16x16_38_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], l[16]; + + in[0] = load_pack_8_32bit(input + 0 * 16); + in[1] = load_pack_8_32bit(input + 1 * 16); + in[2] = load_pack_8_32bit(input + 2 * 16); + in[3] = load_pack_8_32bit(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, in[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_4x4(input, 16, in); + highbd_idct16x16_10_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(&all[0][i], out); + highbd_idct16x16_10_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c index 06f265918..c710e8995 100644 --- a/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c @@ -14,6 +14,768 @@ #include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" +static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + 
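/* [editor's annotation, an inference] The two butterflies below swap the
 * cosine and output ordering relative to the SSE4.1 version; this appears
 * to absorb the sign flips noted in the callers ("step1[x] = -step1[x]"),
 * avoiding explicit negations, since SSE2 has no packed 32-bit multiply
 * (_mm_mullo_epi32 is SSE4.1) and the helpers work on magnitudes plus a
 * sign, as in abs_extend_64bit_sse2() above. */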
highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi32(step2[8], step2[11]); + step1[9] = _mm_add_epi32(step2[9], step2[10]); + step1[10] = _mm_sub_epi32(step2[9], step2[10]); + step1[11] = _mm_sub_epi32(step2[8], step2[11]); + step1[12] = _mm_sub_epi32(step2[15], step2[12]); + step1[13] = _mm_sub_epi32(step2[14], step2[13]); + step1[14] = _mm_add_epi32(step2[14], step2[13]); + step1[15] = _mm_add_epi32(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + highbd_butterfly_sse2(step1[13], step1[10], cospi_16_64, cospi_16_64, + &out[10], &out[13]); + highbd_butterfly_sse2(step1[12], step1[11], cospi_16_64, cospi_16_64, + &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi32(step1[16], step1[19]); + step2[17] = _mm_add_epi32(step1[17], step1[18]); + step2[18] = _mm_sub_epi32(step1[17], step1[18]); + step2[19] = _mm_sub_epi32(step1[16], step1[19]); + step2[20] = _mm_sub_epi32(step1[20], step1[23]); // step2[20] = -step2[20] + step2[21] = _mm_sub_epi32(step1[21], step1[22]); // step2[21] = -step2[21] + step2[22] = _mm_add_epi32(step1[21], step1[22]); + step2[23] = _mm_add_epi32(step1[20], step1[23]); + + step2[24] = _mm_add_epi32(step1[27], step1[24]); + step2[25] = _mm_add_epi32(step1[26], step1[25]); + step2[26] = _mm_sub_epi32(step1[26], step1[25]); // step2[26] = -step2[26] + step2[27] = _mm_sub_epi32(step1[27], step1[24]); // step2[27] = -step2[27] + step2[28] = _mm_sub_epi32(step1[31], step1[28]); + step2[29] = _mm_sub_epi32(step1[30], step1[29]); + step2[30] = _mm_add_epi32(step1[29], step1[30]); + step2[31] = _mm_add_epi32(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + highbd_butterfly_sse2(step2[29], step2[18], cospi_24_64, cospi_8_64, + &step1[18], &step1[29]); + highbd_butterfly_sse2(step2[28], step2[19], cospi_24_64, cospi_8_64, + &step1[19], &step1[28]); + highbd_butterfly_sse2(step2[20], step2[27], cospi_8_64, cospi_24_64, + &step1[27], &step1[20]); + highbd_butterfly_sse2(step2[21], step2[26], cospi_8_64, cospi_24_64, + &step1[26], &step1[21]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[16] = _mm_add_epi32(step1[16], step1[23]); + step2[17] = _mm_add_epi32(step1[17], step1[22]); + step2[18] = _mm_add_epi32(step1[18], step1[21]); + step2[19] = _mm_add_epi32(step1[19], step1[20]); + step2[20] = _mm_sub_epi32(step1[19], step1[20]); + step2[21] = _mm_sub_epi32(step1[18], step1[21]); + step2[22] = _mm_sub_epi32(step1[17], step1[22]); + step2[23] = _mm_sub_epi32(step1[16], step1[23]); + + step2[24] = _mm_sub_epi32(step1[31], step1[24]); + step2[25] = _mm_sub_epi32(step1[30], step1[25]); + step2[26] = _mm_sub_epi32(step1[29], step1[26]); + step2[27] = _mm_sub_epi32(step1[28], step1[27]); + step2[28] = _mm_add_epi32(step1[27], step1[28]); + step2[29] = _mm_add_epi32(step1[26], step1[29]); + step2[30] = _mm_add_epi32(step1[25], step1[30]); + step2[31] = _mm_add_epi32(step1[24], step1[31]); + + // stage 7 + out[16] = step2[16]; + out[17] = step2[17]; + 
out[18] = step2[18]; + out[19] = step2[19]; + highbd_butterfly_sse2(step2[27], step2[20], cospi_16_64, cospi_16_64, + &out[20], &out[27]); + highbd_butterfly_sse2(step2[26], step2[21], cospi_16_64, cospi_16_64, + &out[21], &out[26]); + highbd_butterfly_sse2(step2[25], step2[22], cospi_16_64, cospi_16_64, + &out[22], &out[25]); + highbd_butterfly_sse2(step2[24], step2[23], cospi_16_64, cospi_16_64, + &out[23], &out[24]); + out[28] = step2[28]; + out[29] = step2[29]; + out[30] = step2[30]; + out[31] = step2[31]; +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_butterfly_sse2(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_butterfly_sse2(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_butterfly_sse2(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_butterfly_sse2(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse2(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse2(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void 
highbd_idct32_1024_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_1024_4x32_quarter_1(in, temp); + highbd_idct32_1024_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_butterfly_sse2(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_butterfly_sse2(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], + &step1[30]); + highbd_butterfly_sse2(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_butterfly_sse2(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_butterfly_sse2(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_butterfly_sse2(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], + &step1[26]); + + highbd_butterfly_sse2(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_butterfly_sse2(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18] + step2[19] = _mm_add_epi32(step1[18], step1[19]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22] + step2[23] = _mm_add_epi32(step1[22], step1[23]); + + step2[24] = _mm_add_epi32(step1[25], step1[24]); + step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25] + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[29], step1[28]); + step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29] + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_1024_4x32_quarter_1_2(io, temp); + highbd_idct32_1024_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[4][32], io[32]; + + // rows + for (i = 0; i < 4; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]); + 
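/* [editor's annotation] Each pass of this loop consumes an 8-row slice of
 * the 32x32 coefficient block as four packed and transposed 8x8 tiles,
 * then runs the 16-bit row idct on it; input += 32 << 3 advances those
 * 8 rows of 32 coefficients. */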
highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]); + highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]); + highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + // Transpose 32x8 block to 8x32 block + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + idct32_1024_8x32(io, io); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, io[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 8; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]); + highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]); + highbd_idct32_1024_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + transpose_32bit_4x4(all[4] + i, out + 16); + transpose_32bit_4x4(all[5] + i, out + 20); + transpose_32bit_4x4(all[6] + i, out + 24); + transpose_32bit_4x4(all[7] + i, out + 28); + highbd_idct32_1024_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_neg_sse2(in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_partial_butterfly_sse2(in[8], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse2(in[2], 
cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse2(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_135_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_135_4x32_quarter_1(in, temp); + highbd_idct32_135_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_neg_sse2(in[15], cospi_15_64, cospi_17_64, + &step1[17], &step1[30]); + highbd_partial_butterfly_sse2(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_neg_sse2(in[11], cospi_11_64, cospi_21_64, + &step1[21], &step1[26]); + + highbd_partial_butterfly_sse2(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18] + step2[19] = _mm_add_epi32(step1[18], step1[19]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22] + step2[23] = _mm_add_epi32(step1[22], step1[23]); + + step2[24] = _mm_add_epi32(step1[25], step1[24]); + step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25] + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[29], step1[28]); + step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29] + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, 
cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_135_4x32_quarter_1_2(io, temp); + highbd_idct32_135_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[2][32], in[32], out[32]; + + for (i = 16; i < 32; i++) { + in[i] = _mm_setzero_si128(); + } + + // rows + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]); + idct32_1024_8x32(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_1024_8x32(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_135_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_135_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + + // stage 4 + highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + 
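/* [editor's annotation] With only 34 non-zero coefficients, just in[2]
 * and in[6] contribute to this quarter, so the two-input butterflies of
 * the 1024-coefficient path reduce to the single-input partial forms. */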
highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + step1[10] = + _mm_sub_epi32(_mm_setzero_si128(), step1[10]); // step1[10] = -step1[10] + step1[13] = + _mm_sub_epi32(_mm_setzero_si128(), step1[13]); // step1[13] = -step1[13] + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_34_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_34_4x32_quarter_1(in, temp); + highbd_idct32_34_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + + // stage 3 + step2[18] = + _mm_sub_epi32(_mm_setzero_si128(), step2[18]); // step2[18] = -step2[18] + step2[22] = + _mm_sub_epi32(_mm_setzero_si128(), step2[22]); // step2[22] = -step2[22] + step2[25] = + _mm_sub_epi32(_mm_setzero_si128(), step2[25]); // step2[25] = -step2[25] + step2[29] = + _mm_sub_epi32(_mm_setzero_si128(), step2[29]); // step2[29] = -step2[29] + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_34_4x32_quarter_1_2(io, temp); + highbd_idct32_34_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[32], in[32], out[32]; + + // rows + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + idct32_34_8x32_sse2(in, col); + + // columns + for (i = 0; i < 32; i += 8) { + 
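/* [editor's annotation] Column pass of the 34-coefficient path:
 * re-transpose each 8-column strip of the row-pass output, run the 8x32
 * idct again, then clip and add into the 16-bit destination via
 * highbd_write_buffer_8() at the given bit depth. */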
transpose_16bit_8x8(col + i, in); + idct32_34_8x32_sse2(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_34_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_34_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { highbd_idct_1_add_kernel(input, dest, stride, bd, 32); diff --git a/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c b/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c new file mode 100644 index 000000000..2d0a53ac0 --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c @@ -0,0 +1,765 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[13], step1[10], -cospi_8_64, cospi_24_64, + &step2[10], &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi32(step2[8], step2[11]); + step1[9] = _mm_add_epi32(step2[9], step2[10]); + step1[10] = _mm_sub_epi32(step2[9], step2[10]); + step1[11] = _mm_sub_epi32(step2[8], step2[11]); + step1[12] = _mm_sub_epi32(step2[15], step2[12]); + step1[13] = _mm_sub_epi32(step2[14], step2[13]); + step1[14] = _mm_add_epi32(step2[14], step2[13]); + step1[15] = _mm_add_epi32(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + highbd_butterfly_sse4_1(step1[13], step1[10], cospi_16_64, cospi_16_64, + &out[10], &out[13]); + highbd_butterfly_sse4_1(step1[12], step1[11], cospi_16_64, cospi_16_64, + &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi32(step1[16], step1[19]); + step2[17] = _mm_add_epi32(step1[17], step1[18]); + step2[18] = _mm_sub_epi32(step1[17], step1[18]); + step2[19] = _mm_sub_epi32(step1[16], step1[19]); + step2[20] = _mm_sub_epi32(step1[23], step1[20]); + step2[21] 
= _mm_sub_epi32(step1[22], step1[21]); + step2[22] = _mm_add_epi32(step1[22], step1[21]); + step2[23] = _mm_add_epi32(step1[23], step1[20]); + + step2[24] = _mm_add_epi32(step1[24], step1[27]); + step2[25] = _mm_add_epi32(step1[25], step1[26]); + step2[26] = _mm_sub_epi32(step1[25], step1[26]); + step2[27] = _mm_sub_epi32(step1[24], step1[27]); + step2[28] = _mm_sub_epi32(step1[31], step1[28]); + step2[29] = _mm_sub_epi32(step1[30], step1[29]); + step2[30] = _mm_add_epi32(step1[29], step1[30]); + step2[31] = _mm_add_epi32(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + highbd_butterfly_sse4_1(step2[29], step2[18], cospi_24_64, cospi_8_64, + &step1[18], &step1[29]); + highbd_butterfly_sse4_1(step2[28], step2[19], cospi_24_64, cospi_8_64, + &step1[19], &step1[28]); + highbd_butterfly_sse4_1(step2[27], step2[20], -cospi_8_64, cospi_24_64, + &step1[20], &step1[27]); + highbd_butterfly_sse4_1(step2[26], step2[21], -cospi_8_64, cospi_24_64, + &step1[21], &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[16] = _mm_add_epi32(step1[16], step1[23]); + step2[17] = _mm_add_epi32(step1[17], step1[22]); + step2[18] = _mm_add_epi32(step1[18], step1[21]); + step2[19] = _mm_add_epi32(step1[19], step1[20]); + step2[20] = _mm_sub_epi32(step1[19], step1[20]); + step2[21] = _mm_sub_epi32(step1[18], step1[21]); + step2[22] = _mm_sub_epi32(step1[17], step1[22]); + step2[23] = _mm_sub_epi32(step1[16], step1[23]); + + step2[24] = _mm_sub_epi32(step1[31], step1[24]); + step2[25] = _mm_sub_epi32(step1[30], step1[25]); + step2[26] = _mm_sub_epi32(step1[29], step1[26]); + step2[27] = _mm_sub_epi32(step1[28], step1[27]); + step2[28] = _mm_add_epi32(step1[27], step1[28]); + step2[29] = _mm_add_epi32(step1[26], step1[29]); + step2[30] = _mm_add_epi32(step1[25], step1[30]); + step2[31] = _mm_add_epi32(step1[24], step1[31]); + + // stage 7 + out[16] = step2[16]; + out[17] = step2[17]; + out[18] = step2[18]; + out[19] = step2[19]; + highbd_butterfly_sse4_1(step2[27], step2[20], cospi_16_64, cospi_16_64, + &out[20], &out[27]); + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_16_64, cospi_16_64, + &out[21], &out[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], cospi_16_64, cospi_16_64, + &out[22], &out[25]); + highbd_butterfly_sse4_1(step2[24], step2[23], cospi_16_64, cospi_16_64, + &out[23], &out[24]); + out[28] = step2[28]; + out[29] = step2[29]; + out[30] = step2[30]; + out[31] = step2[31]; +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_butterfly_sse4_1(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_butterfly_sse4_1(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_butterfly_sse4_1(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], 
step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_butterfly_sse4_1(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse4_1(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse4_1(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse4_1(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_1024_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_1024_4x32_quarter_1(in, temp); + highbd_idct32_1024_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_butterfly_sse4_1(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_butterfly_sse4_1(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], + &step1[30]); + highbd_butterfly_sse4_1(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_butterfly_sse4_1(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_butterfly_sse4_1(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_butterfly_sse4_1(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], + &step1[26]); + + highbd_butterfly_sse4_1(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_butterfly_sse4_1(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = 
_mm_sub_epi32(step1[19], step1[18]); + step2[19] = _mm_add_epi32(step1[19], step1[18]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[23], step1[22]); + step2[23] = _mm_add_epi32(step1[23], step1[22]); + + step2[24] = _mm_add_epi32(step1[24], step1[25]); + step2[25] = _mm_sub_epi32(step1[24], step1[25]); + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[28], step1[29]); + step2[29] = _mm_sub_epi32(step1[28], step1[29]); + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_1024_4x32_quarter_1_2(io, temp); + highbd_idct32_1024_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_1024_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[4][32], io[32]; + + // rows + for (i = 0; i < 4; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]); + highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]); + highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + // Transpose 32x8 block to 8x32 block + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + idct32_1024_8x32(io, io); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, io[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 8; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]); + highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]); + highbd_idct32_1024_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + transpose_32bit_4x4(all[4] + i, out + 16); + transpose_32bit_4x4(all[5] + i, out + 20); + transpose_32bit_4x4(all[6] + i, out + 24); + transpose_32bit_4x4(all[7] + i, out + 28); + highbd_idct32_1024_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- 
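Editor's note: every *_quarter_* kernel above chains one primitive, a fixed-point rotation followed by a rounding shift. As a reading aid for the SIMD, here is a rough scalar model of what highbd_butterfly_sse4_1() computes. It is an illustrative sketch only, not part of the patch; the name scalar_butterfly_model is made up here, and it assumes libvpx's usual convention that the cospi_k_64 constants are cos(k*pi/64) scaled by 2^14 (DCT_CONST_BITS == 14).

#include <stdint.h>

// Scalar model of highbd_butterfly_sse4_1(in0, in1, c0, c1, &out0, &out1);
// the intrinsic version performs this on four 32-bit lanes at once.
static void scalar_butterfly_model(int32_t in0, int32_t in1, int c0, int c1,
                                   int32_t *out0, int32_t *out1) {
  // Rotation by the paired angles; products need 64 bits before rounding.
  const int64_t t0 = (int64_t)in0 * c0 - (int64_t)in1 * c1;
  const int64_t t1 = (int64_t)in0 * c1 + (int64_t)in1 * c0;
  // dct_const_round_shift(): add 2^13, then arithmetic shift right by 14.
  *out0 = (int32_t)((t0 + (1 << 13)) >> 14);
  *out1 = (int32_t)((t1 + (1 << 13)) >> 14);
}

Under that reading, a call such as highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], &step[3]) corresponds to in0 = io[1], in1 = io[3], c0 = cospi_24_64, c1 = cospi_8_64.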
+ +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_sse4_1(in[12], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_partial_butterfly_sse4_1(in[8], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(in[14], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse4_1(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_135_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_135_4x32_quarter_1(in, temp); + highbd_idct32_135_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_sse4_1(in[15], -cospi_17_64, cospi_15_64, &step1[17], + &step1[30]); + 
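 // Editor's note: highbd_partial_butterfly_sse4_1() appears to be the
 // degenerate form of the two-input rotation used in the 1024-coefficient
 // path above. The 135- and 34-coefficient builds know that one input of
 // each pair is zero, so the butterfly collapses to two independent
 // scalings. Illustrative scalar model only, assuming the 2^14 cospi
 // fixed-point scale:
 //   out0 = (int32_t)((in * c0 + (1 << 13)) >> 14);
 //   out1 = (int32_t)((in * c1 + (1 << 13)) >> 14);
 // The negated constants passed above (e.g. -cospi_17_64) fold in the sign
 // that the full butterfly's subtract of the zero partner's pair would
 // have produced.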
highbd_partial_butterfly_sse4_1(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_sse4_1(in[11], -cospi_21_64, cospi_11_64, &step1[21], + &step1[26]); + + highbd_partial_butterfly_sse4_1(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[19], step1[18]); + step2[19] = _mm_add_epi32(step1[19], step1[18]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[23], step1[22]); + step2[23] = _mm_add_epi32(step1[23], step1[22]); + + step2[24] = _mm_add_epi32(step1[24], step1[25]); + step2[25] = _mm_sub_epi32(step1[24], step1[25]); + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[28], step1[29]); + step2[29] = _mm_sub_epi32(step1[28], step1[29]); + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_135_4x32_quarter_1_2(io, temp); + highbd_idct32_135_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_135_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[2][32], in[32], out[32]; + + // rows + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]); + idct32_135_8x32_ssse3(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_135_8x32_ssse3(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_135_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_135_4x32(out); + + for (j = 0; j < 32; ++j) { + 
highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + + // stage 4 + highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_34_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_34_4x32_quarter_1(in, temp); + highbd_idct32_34_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + + // stage 3 + step1[16] = step2[16]; + step1[31] = 
step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_34_4x32_quarter_1_2(io, temp); + highbd_idct32_34_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_34_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[32], in[32], out[32]; + + // rows + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + idct32_34_8x32_ssse3(in, col); + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col + i, in); + idct32_34_8x32_ssse3(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_34_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_34_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c index 89a2584e3..2e54d2473 100644 --- a/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c @@ -8,144 +8,152 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <emmintrin.h> // SSE2 + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0, + const __m128i in1) { + const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 1 + const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 2, 3 + const __m128i t2 = _mm_unpacklo_epi64(t0, t1); // 0, 1, 2, 3 + return dct_const_round_shift_sse2(t2); +} + +static INLINE void highbd_idct4_small_sse2(__m128i *const io) { + const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64, 0, cospi_16_64, 0); + const __m128i cospi_p08_p08 = _mm_setr_epi32(cospi_8_64, 0, cospi_8_64, 0); + const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64, 0, cospi_24_64, 0); + __m128i temp1[4], temp2[4], step[4]; + + transpose_32bit_4x4(io, io); + + // Note: There is no 32-bit signed multiply SIMD instruction in SSE2. + // _mm_mul_epu32() is used which can only guarantee the lower 32-bit + // (signed) result is meaningful, which is enough in this function. 
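 // Editor's note, expanding on the point above: _mm_mul_epu32() multiplies
 // only the even 32-bit lanes (hence the _mm_srli_si128(x, 4) copies below
 // to reach the odd lanes) and treats them as unsigned. In two's complement
 // the low 32 bits of an unsigned and a signed 32x32 product are identical;
 // for example (-3) * 11585 = -34755, and (0xFFFFFFFD * 11585) mod 2^32
 // reinterpreted as int32 is likewise -34755. On this path the operands
 // stay small (inputs fit in 16 bits, sums in 17) while every cospi
 // constant is below 2^14, so the true signed product stays below 2^31 and
 // the low 32 bits gathered by dct_const_round_shift_4_sse2() are exact.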
+ + // stage 1 + temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + temp1[1] = _mm_srli_si128(temp1[0], 4); // 1, 3 + temp2[1] = _mm_srli_si128(temp2[0], 4); // 1, 3 + temp1[0] = _mm_mul_epu32(temp1[0], cospi_p16_p16); // ([0] + [2])*cospi_16_64 + temp1[1] = _mm_mul_epu32(temp1[1], cospi_p16_p16); // ([0] + [2])*cospi_16_64 + temp2[0] = _mm_mul_epu32(temp2[0], cospi_p16_p16); // ([0] - [2])*cospi_16_64 + temp2[1] = _mm_mul_epu32(temp2[1], cospi_p16_p16); // ([0] - [2])*cospi_16_64 + step[0] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]); + step[1] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]); + + temp1[3] = _mm_srli_si128(io[1], 4); + temp2[3] = _mm_srli_si128(io[3], 4); + temp1[0] = _mm_mul_epu32(io[1], cospi_p24_p24); // input[1] * cospi_24_64 + temp1[1] = _mm_mul_epu32(temp1[3], cospi_p24_p24); // input[1] * cospi_24_64 + temp2[0] = _mm_mul_epu32(io[1], cospi_p08_p08); // input[1] * cospi_8_64 + temp2[1] = _mm_mul_epu32(temp1[3], cospi_p08_p08); // input[1] * cospi_8_64 + temp1[2] = _mm_mul_epu32(io[3], cospi_p08_p08); // input[3] * cospi_8_64 + temp1[3] = _mm_mul_epu32(temp2[3], cospi_p08_p08); // input[3] * cospi_8_64 + temp2[2] = _mm_mul_epu32(io[3], cospi_p24_p24); // input[3] * cospi_24_64 + temp2[3] = _mm_mul_epu32(temp2[3], cospi_p24_p24); // input[3] * cospi_24_64 + temp1[0] = _mm_sub_epi64(temp1[0], temp1[2]); // [1]*cospi_24 - [3]*cospi_8 + temp1[1] = _mm_sub_epi64(temp1[1], temp1[3]); // [1]*cospi_24 - [3]*cospi_8 + temp2[0] = _mm_add_epi64(temp2[0], temp2[2]); // [1]*cospi_8 + [3]*cospi_24 + temp2[1] = _mm_add_epi64(temp2[1], temp2[3]); // [1]*cospi_8 + [3]*cospi_24 + step[2] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]); + step[3] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +static INLINE void highbd_idct4_large_sse2(__m128i *const io) { + __m128i step[4]; + + transpose_32bit_4x4(io, io); + + // stage 1 + highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]); + highbd_butterfly_sse2(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], + &step[3]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - __m128i inptr[4]; - __m128i sign_bits[2]; - __m128i temp_mm, min_input, max_input; - int test; - int optimised_cols = 0; - const __m128i zero = _mm_set1_epi16(0); - const __m128i eight = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(12043); - const __m128i min = _mm_set1_epi16(-12043); - // Load input into __m128i - inptr[0] = _mm_loadu_si128((const __m128i *)input); - inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); - inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); - inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); - - // Pack to 16 bits - inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); - inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); - - max_input = _mm_max_epi16(inptr[0], 
inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (!test) { - // Do the row transform - idct4_sse2(inptr); - - // Check the min & max values - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (test) { - transpose_16bit_4x4(inptr); - sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); - sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); - inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); - inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); - inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); - inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); - _mm_storeu_si128((__m128i *)outptr, inptr[0]); - _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); - _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); - _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } + int16_t max = 0, min = 0; + __m128i io[4], io_short[2]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + + if (bd != 8) { + __m128i max_input, min_input; + + max_input = _mm_max_epi16(io_short[0], io_short[1]); + min_input = _mm_min_epi16(io_short[0], io_short[1]); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 8)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 8)); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 4)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 4)); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 2)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 2)); + max = _mm_extract_epi16(max_input, 0); + min = _mm_extract_epi16(min_input, 0); } - if (optimised_cols) { - idct4_sse2(inptr); - - // Final round and shift - inptr[0] = _mm_add_epi16(inptr[0], eight); - inptr[1] = _mm_add_epi16(inptr[1], eight); - - inptr[0] = _mm_srai_epi16(inptr[0], 4); - inptr[1] = _mm_srai_epi16(inptr[1], 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi64( - d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); - // store input0 - _mm_storel_epi64((__m128i *)dest, d0); - // store input1 - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - // store input2 - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - // store input3 - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); - } + if (bd == 8 || 
(max < 4096 && min >= -4096)) { + idct4_sse2(io_short); + idct4_sse2(io_short); + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); } else { - // Run the un-optimised column transform - tran_low_t temp_in[4], temp_out[4]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - vpx_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } + if (max < 32767 && min > -32768) { + highbd_idct4_small_sse2(io); + highbd_idct4_small_sse2(io); + } else { + highbd_idct4_large_sse2(io); + highbd_idct4_large_sse2(io); } + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); } + + recon_and_store_4x4(io, dest, stride, bd); } void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - const __m128i zero = _mm_setzero_si128(); - // Faster than _mm_set1_epi16((1 << bd) - 1). - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); int a1, i; tran_low_t out; __m128i dc, d; - out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 4); dc = _mm_set1_epi16(a1); for (i = 0; i < 4; ++i) { d = _mm_loadl_epi64((const __m128i *)dest); - d = add_dc_clamp(&zero, &max, &dc, &d); + d = add_clamp(d, dc, bd); _mm_storel_epi64((__m128i *)dest, d); dest += stride; } diff --git a/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c new file mode 100644 index 000000000..38e64f3bc --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static INLINE void highbd_idct4(__m128i *const io) { + __m128i temp[2], step[4]; + + transpose_32bit_4x4(io, io); + + // stage 1 + temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + extend_64bit(temp[0], temp); + step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + extend_64bit(temp[0], temp); + step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], + &step[3]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[4]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + if (bd == 8) { + __m128i io_short[2]; + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + idct4_sse2(io_short); + idct4_sse2(io_short); + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + highbd_idct4(io); + highbd_idct4(io); + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} diff --git a/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c index 29cc1d30e..909a6b794 100644 --- a/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -8,211 +8,203 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <emmintrin.h> // SSE2 + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" + +static void highbd_idct8x8_half1d(__m128i *const io) { + __m128i step1[8], step2[8]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + step1[0] = io[0]; + step1[2] = io[4]; + step1[1] = io[2]; + step1[3] = io[6]; + highbd_butterfly_sse2(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 2 + highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]); + highbd_butterfly_sse2(step1[1], step1[3], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +static void highbd_idct8x8_12_half1d(__m128i *const io) { + __m128i temp1[4], sign[2], step1[8], step2[8]; + + transpose_32bit_4x4(io, io); + + // stage 1 + step1[0] = io[0]; + step1[1] = io[2]; + abs_extend_64bit_sse2(io[1], temp1, sign); + step1[4] = multiplication_round_shift_sse2(temp1, sign, cospi_28_64); + step1[7] = multiplication_round_shift_sse2(temp1, sign, cospi_4_64); + abs_extend_64bit_sse2(io[3], temp1, sign); + step1[5] = multiplication_neg_round_shift_sse2(temp1, sign, cospi_20_64); + step1[6] = multiplication_round_shift_sse2(temp1, sign, cospi_12_64); + + // stage 2 + abs_extend_64bit_sse2(step1[0], temp1, sign); + step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + abs_extend_64bit_sse2(step1[1], temp1, sign); + step2[2] = multiplication_round_shift_sse2(temp1, sign, cospi_24_64); + step2[3] = multiplication_round_shift_sse2(temp1, sign, cospi_8_64); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[0], step2[2]); + step1[2] = _mm_sub_epi32(step2[0], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, 
temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_8x8(inptr, inptr); - for (i = 0; i < 8; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + idct8_sse2(io_short); + idct8_sse2(io_short); + round_shift_8x8(io_short, io); } else { - // Run the un-optimised row transform - for (i = 0; i < 8; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } + __m128i temp[4]; + + highbd_idct8x8_half1d(io); + + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = 
_mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + highbd_idct8x8_half1d(&io[8]); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + highbd_idct8x8_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_half1d(&io[8]); + + highbd_idct8x8_final_round(io); } - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } + recon_and_store_8x8(io, dest, stride, bd); } void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } + const __m128i zero = _mm_setzero_si128(); + __m128i io[16]; - // Find the min & max for the row transform - // only first 4 row has non-zero coefs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - // N.B. 
Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_4X8(inptr, inptr); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], zero); + io_short[1] = _mm_packs_epi32(io[1], zero); + io_short[2] = _mm_packs_epi32(io[2], zero); + io_short[3] = _mm_packs_epi32(io[3], zero); - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } + idct8x8_12_add_kernel_sse2(io_short); + round_shift_8x8(io_short, io); } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } + __m128i temp[4]; + + highbd_idct8x8_12_half1d(io); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + highbd_idct8x8_12_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_12_half1d(&io[8]); + + highbd_idct8x8_final_round(io); } + + recon_and_store_8x8(io, dest, stride, bd); } void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest, diff --git a/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c new file mode 100644 index 000000000..ae391b2c0 --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static void highbd_idct8x8_half1d(__m128i *const io) { + __m128i step1[8], step2[8]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + step1[0] = io[0]; + step1[2] = io[4]; + step1[1] = io[2]; + step1[3] = io[6]; + highbd_butterfly_sse4_1(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 2 + highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]); + highbd_butterfly_sse4_1(step1[1], step1[3], cospi_24_64, cospi_8_64, + &step2[2], &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +static void highbd_idct8x8_12_half1d(__m128i *const io) { + __m128i temp1[2], step1[8], step2[8]; + + transpose_32bit_4x4(io, io); + + // stage 1 + step1[0] = io[0]; + step1[1] = io[2]; + extend_64bit(io[1], temp1); + step1[4] = multiplication_round_shift_sse4_1(temp1, cospi_28_64); + step1[7] = multiplication_round_shift_sse4_1(temp1, cospi_4_64); + extend_64bit(io[3], temp1); + step1[5] = multiplication_round_shift_sse4_1(temp1, -cospi_20_64); + step1[6] = multiplication_round_shift_sse4_1(temp1, cospi_12_64); + + // stage 2 + extend_64bit(step1[0], temp1); + step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + extend_64bit(step1[1], temp1); + step2[2] = multiplication_round_shift_sse4_1(temp1, cospi_24_64); + step2[3] = multiplication_round_shift_sse4_1(temp1, cospi_8_64); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[0], step2[2]); + step1[2] = _mm_sub_epi32(step2[0], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 
3 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + idct8_sse2(io_short); + idct8_sse2(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_half1d(io); + + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + highbd_idct8x8_half1d(&io[8]); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + highbd_idct8x8_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} + +void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], zero); + io_short[1] = _mm_packs_epi32(io[1], zero); + io_short[2] = _mm_packs_epi32(io[2], zero); + io_short[3] = _mm_packs_epi32(io[3], zero); + + idct8x8_12_add_kernel_ssse3(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_12_half1d(io); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + highbd_idct8x8_12_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_12_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} diff --git a/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c new file mode 100644 index 000000000..2051381aa --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c @@ -0,0 +1,533 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +// ----------------------------------------------------------------------------- + +void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); +} + +static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)*dst, val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = 
_mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_16_unpacklo(&dst, stride, &row0); + h_store_16_unpacklo(&dst, stride, &row1); + h_store_16_unpacklo(&dst, stride, &row2); + h_store_16_unpacklo(&dst, stride, &row3); + h_store_16_unpackhi(&dst, stride, &row4); + h_store_16_unpackhi(&dst, stride, &row5); + h_store_16_unpackhi(&dst, stride, &row6); + h_store_16_unpackhi(&dst, stride, &row7); + } +} + +static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_32_unpacklo(&dst, stride, &row0); + h_store_32_unpacklo(&dst, stride, &row1); + h_store_32_unpacklo(&dst, stride, &row2); + h_store_32_unpacklo(&dst, stride, &row3); + h_store_32_unpackhi(&dst, stride, &row4); + h_store_32_unpackhi(&dst, stride, &row5); + h_store_32_unpackhi(&dst, stride, &row6); + h_store_32_unpackhi(&dst, stride, &row7); + } +} + +//------------------------------------------------------------------------------ +// DC 4x4 + +static INLINE __m128i dc_sum_4(const uint16_t *ref) { + const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void 
vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x4(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +static INLINE __m128i dc_sum_8(const uint16_t *ref) { + const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); + const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_8x8(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)left; + (void)bd; + dc_store_8x8(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_8x8(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +static INLINE __m128i dc_sum_16(const uint16_t *ref) { + const __m128i sum_lo = dc_sum_8(ref); + const __m128i sum_hi = dc_sum_8(ref + 8); + return _mm_add_epi16(sum_lo, sum_hi); +} + +static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 16; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_16x16(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const 
uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16x16(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16x16(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +static INLINE __m128i dc_sum_32(const uint16_t *ref) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sum_a = dc_sum_16(ref); + const __m128i sum_b = dc_sum_16(ref + 16); + // 12 bit bd will outrange, so expand to 32 bit before adding final total + return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), + _mm_unpacklo_epi16(sum_b, zero)); +} + +static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 32; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + _mm_store_si128((__m128i *)(dst + 16), dc_dup); + _mm_store_si128((__m128i *)(dst + 24), dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_32x32(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32x32(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32x32(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const 
__m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4); + const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0); + const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00); + const __m128i row0 = _mm_srli_si128(avg2, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg2, 4); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + + dst -= stride; + dst[0] = _mm_extract_epi16(avg3, 1); + dst[stride] = _mm_extract_epi16(avg3, 0); +} + +void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0); + const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC); + const __m128i row0 = _mm_srli_si128(avg3, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg3, 2); + const __m128i row3 = avg3; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5)); + const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0); + const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1); + const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2); + const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3); + const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2); + const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4); + const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00); + const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0); + const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row2 = _mm_srli_si128(row3, 4); + const __m128i row1 = _mm_srli_si128(row3, 8); + const __m128i row0 = _mm_srli_si128(avg3, 4); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst[0] = _mm_extract_epi16(avg2, 3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i IJKL0000 = _mm_load_si128((const __m128i *)left); + const __m128i LLLL0000 = _mm_shufflelo_epi16(IJKL0000, 0xff); + const __m128i IJKLLLLL = 
_mm_unpacklo_epi64(IJKL0000, LLLL0000); + const __m128i JKLLLLL0 = _mm_srli_si128(IJKLLLLL, 2); + const __m128i KLLLLL00 = _mm_srli_si128(IJKLLLLL, 4); + const __m128i avg3 = avg3_epu16(&IJKLLLLL, &JKLLLLL0, &KLLLLL00); + const __m128i avg2 = _mm_avg_epu16(IJKLLLLL, JKLLLLL0); + const __m128i row0 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row1 = _mm_srli_si128(row0, 4); + const __m128i row2 = _mm_srli_si128(row0, 8); + const __m128i row3 = LLLL0000; + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGH0); + const __m128i row0 = avg2; + const __m128i row1 = avg3; + const __m128i row2 = _mm_srli_si128(avg2, 2); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} diff --git a/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c b/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c new file mode 100644 index 000000000..b9dcef205 --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c @@ -0,0 +1,930 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <tmmintrin.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); + dst[3] = above[7]; // aka H +} + +static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, + __m128i *row, const __m128i *ar) { + *row = _mm_alignr_epi8(*ar, *row, 2); + _mm_store_si128((__m128i *)*dst, *row); + *dst += stride; +} + +void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3); + dst += stride; + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); +} + +static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, + __m128i *row_0, __m128i *row_1, + const __m128i *ar) { + *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2); + *row_1 = _mm_alignr_epi8(*ar, *row_1, 2); + _mm_store_si128((__m128i *)*dst, *row_0); + _mm_store_si128((__m128i *)(*dst + 8), *row_1); + *dst += stride; +} + +void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(AR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(AR, A1, 4); + __m128i 
avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + dst += stride; + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); +} + +void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(AR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(AR, A3, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + int i; + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + for (i = 1; i < 32; ++i) { + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2); + avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2); + avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2); + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + } +} + +DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = { + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1 +}; + +static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) { + *a = _mm_shuffle_epi8(*a, *rotrw); + return *a; +} + +void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i IXABCDEF = + _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14); + const __m128i 
avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2); + __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0); + __m128i rowa = avg2; + __m128i rowb = avg3; + int i; + (void)bd; + for (i = 0; i < 8; i += 2) { + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb); + dst += stride; + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14); + } +} + +void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_srli_si128(L1, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + dst += stride; + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = 
_mm_avg_epu16(A3, B3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i C2 = _mm_alignr_epi8(B2, B1, 14); + const __m128i C3 = _mm_alignr_epi8(B3, B2, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2); + const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2); + const __m128i L3_ = _mm_srli_si128(L3, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowa_2 = avg2_2; + __m128i rowa_3 = avg2_3; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i rowb_2 = avg3_2; + __m128i rowb_3 = avg3_3; + __m128i avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_); + avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + _mm_store_si128((__m128i *)(dst + 16), rowb_2); + _mm_store_si128((__m128i *)(dst + 24), rowb_3); + dst += stride; + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14); + rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14); + const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0); + __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + __m128i rowa = avg3; + int i; + (void)bd; + for (i = 0; i < 8; ++i) { + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + } +} + +void 
vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_srli_si128(B1, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + } + } +} + +void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_alignr_epi8(B2, B1, 2); + const __m128i C2 = _mm_alignr_epi8(B3, B2, 2); + const __m128i C3 = _mm_srli_si128(B3, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14); + const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i rowa_2 = avg3_2; + __m128i rowa_3 = avg3_3; + __m128i 
avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_); + avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + } + } +} + +void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2); + const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4); + const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14); + const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO); + const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left); + const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left); + const __m128i row0 = + _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12); + const __m128i row1 = + _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12); + const __m128i row2 = + _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12); + const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12); + const __m128i row4 = + _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12); + const __m128i row5 = + _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12); + const __m128i row6 = + _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12); + const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12); + (void)bd; + _mm_store_si128((__m128i *)dst, row0); + dst += stride; + _mm_store_si128((__m128i *)dst, row1); + dst += stride; + _mm_store_si128((__m128i *)dst, row2); + dst += stride; + _mm_store_si128((__m128i *)dst, row3); + dst += stride; + _mm_store_si128((__m128i *)dst, row4); + dst += stride; + _mm_store_si128((__m128i *)dst, row5); + dst += stride; + _mm_store_si128((__m128i *)dst, row6); + dst += stride; + _mm_store_si128((__m128i *)dst, row7); +} + +void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_srli_si128(A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_srli_si128(A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i AXL0 = 
_mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i avg2_avg3_left[2][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + + for (j = 0; j < 2; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + } + } +} + +void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_srli_si128(A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_srli_si128(A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12); + const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i 
avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2); + const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2); + const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i row_2 = avg3_2; + __m128i row_3 = avg3_3; + __m128i avg2_avg3_left[4][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3); + avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3); + + for (j = 0; j < 4; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + } + } +} + +static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b) { + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i 
HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH); + const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3); + const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3); + (void)above; + (void)bd; + d207_store_4x8(&dst, stride, &out_a, &out_b); + d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH); +} + +static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b, + const __m128i *c) { + _mm_store_si128((__m128i *)*dst, *a); + _mm_store_si128((__m128i *)(*dst + 8), *b); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)left); + const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i LR = _mm_unpackhi_epi64(LR0, LR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(LR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(LR, A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0); + const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0); + const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1); + const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1); + (void)above; + (void)bd; + d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c); + d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d); + d207_store_4x16(&dst, stride, &out_c, &out_d, &LR); + d207_store_4x16(&dst, stride, &out_d, &LR, &LR); +} + +static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b, + const __m128i *c, const __m128i *d, + const __m128i *e) { + _mm_store_si128((__m128i *)*dst, *a); + _mm_store_si128((__m128i *)(*dst + 8), *b); + _mm_store_si128((__m128i *)(*dst + 16), *c); + _mm_store_si128((__m128i *)(*dst + 24), *d); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + 
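// Last of the four rows: each row shifts the interleaved avg2/avg3 stream
// left by two more pixels (alignr by 4, 8, then 12 bytes), so the caller
// resumes one full vector further along the stream.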
_mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)left); + const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i LR = _mm_unpackhi_epi64(LR0, LR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(LR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(LR, A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = _mm_avg_epu16(A3, B3); + const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0); + const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0); + const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1); + const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1); + const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2); + const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2); + const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3); + const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3); + (void)above; + (void)bd; + d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e); + d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f); + d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g); + d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h); + d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR); + d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR); + d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR); + d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR); +} + +static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride, + __m128i *a, __m128i *b, const __m128i *ar) { + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, *b); + *dst += stride; + *a = _mm_alignr_epi8(*ar, *a, 2); + *b = _mm_alignr_epi8(*ar, *b, 2); + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, *b); + *dst += stride; + *a = _mm_alignr_epi8(*ar, *a, 2); + *b = _mm_alignr_epi8(*ar, *b, 2); +} + +void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, 
ABCDEFGH, 4); + __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH); + (void)left; + (void)bd; + d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH); + d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH); +} + +void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(AR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(AR, A1, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg2_0 = _mm_avg_epu16(A0, B0); + __m128i avg2_1 = _mm_avg_epu16(A1, B1); + int i; + (void)left; + (void)bd; + for (i = 0; i < 14; i += 2) { + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + dst += stride; + avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2); + avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2); + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2); + } + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); +} + +void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(AR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(AR, A3, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + __m128i avg2_0 = _mm_avg_epu16(A0, B0); + __m128i avg2_1 = _mm_avg_epu16(A1, B1); + __m128i avg2_2 = _mm_avg_epu16(A2, B2); + __m128i avg2_3 = _mm_avg_epu16(A3, B3); + int i; + (void)left; + (void)bd; + for (i = 0; i < 30; i += 2) { + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + _mm_store_si128((__m128i *)(dst + 16), avg2_2); + _mm_store_si128((__m128i *)(dst + 24), avg2_3); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2); + avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2); + avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 
2); + avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2); + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2); + avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2); + avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2); + } + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + _mm_store_si128((__m128i *)(dst + 16), avg2_2); + _mm_store_si128((__m128i *)(dst + 24), avg2_3); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); +} diff --git a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h index ea100c6e1..e0f749552 100644 --- a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h +++ b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -12,59 +12,389 @@ #define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ #include <emmintrin.h> // SSE2 + #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -static INLINE __m128i add_dc_clamp(const __m128i *const min, - const __m128i *const max, - const __m128i *const dc, - const __m128i *const in) { - __m128i out; - out = _mm_adds_epi16(*in, *dc); - out = _mm_max_epi16(out, *min); - out = _mm_min_epi16(out, *max); - return out; +static INLINE void extend_64bit(const __m128i in, + __m128i *const out /*out[2]*/) { + out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1 + out[1] = _mm_unpackhi_epi32(in, in); // 2, 2, 3, 3 } -static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input, - uint16_t *dest, int stride, int bd, - const int size) { - const __m128i zero = _mm_setzero_si128(); +static INLINE __m128i wraplow_16bit_shift4(const __m128i in0, const __m128i in1, + const __m128i rounding) { + __m128i temp[2]; + temp[0] = _mm_add_epi32(in0, rounding); + temp[1] = _mm_add_epi32(in1, rounding); + temp[0] = _mm_srai_epi32(temp[0], 4); + temp[1] = _mm_srai_epi32(temp[1], 4); + return _mm_packs_epi32(temp[0], temp[1]); +} + +static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1, + const __m128i rounding) { + __m128i temp[2]; + temp[0] = _mm_add_epi32(in0, rounding); + temp[1] = _mm_add_epi32(in1, rounding); + temp[0] = _mm_srai_epi32(temp[0], 5); + temp[1] = _mm_srai_epi32(temp[1], 5); + return _mm_packs_epi32(temp[0], temp[1]); +} + +static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) { + const __m128i t = + _mm_add_epi64(in, pair_set_epi32(DCT_CONST_ROUNDING << 2, 0)); + return _mm_srli_si128(t, 2); +} + +static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) { + const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 2 + const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 1, 3 + return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3 +} + +static INLINE void abs_extend_64bit_sse2(const __m128i in, + __m128i *const out /*out[2]*/, + __m128i *const sign /*sign[2]*/) { + sign[0] = _mm_srai_epi32(in, 31); + out[0] = _mm_xor_si128(in, sign[0]); + out[0] = _mm_sub_epi32(out[0], sign[0]); + sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]); // 64-bit sign of 2, 3 + sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]); // 64-bit sign of 0, 1 + out[1] = _mm_unpackhi_epi32(out[0], out[0]); // 2, 3 + out[0] = _mm_unpacklo_epi32(out[0], out[0]); // 0, 1 +} + +// Note: cospi must be non negative. 
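+// SSE2 has no signed 32x32->64 bit multiply (_mm_mul_epi32 is SSE4.1), only
+// the unsigned _mm_mul_epu32, so the helpers below multiply the absolute
+// values prepared by abs_extend_64bit_sse2() and restore the sign afterwards.
+// `sign` is 0 or -1 (all ones) in each 64-bit lane; with two's complement,
+// (product ^ sign) - sign yields product when sign == 0 and -product when
+// sign == -1, i.e. a branchless conditional negate of each 64-bit product.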
+static INLINE __m128i multiply_apply_sign_sse2(const __m128i in, + const __m128i sign, + const __m128i cospi) { + __m128i out = _mm_mul_epu32(in, cospi); + out = _mm_xor_si128(out, sign); + return _mm_sub_epi64(out, sign); +} + +// Note: c must be non negative. +static INLINE __m128i multiplication_round_shift_sse2( + const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/, + const int c) { + const __m128i pair_c = pair_set_epi32(c << 2, 0); + __m128i t0, t1; + + assert(c >= 0); + t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); + t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +// Note: c must be non negative. +static INLINE __m128i multiplication_neg_round_shift_sse2( + const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/, + const int c) { + const __m128i pair_c = pair_set_epi32(c << 2, 0); + __m128i t0, t1; + + assert(c >= 0); + t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); + t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); + t0 = _mm_sub_epi64(_mm_setzero_si128(), t0); + t1 = _mm_sub_epi64(_mm_setzero_si128(), t1); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0); + const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0); + __m128i temp1[4], temp2[4], sign1[2], sign2[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in0, temp1, sign1); + abs_extend_64bit_sse2(in1, temp2, sign2); + temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1); + temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c1); + temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c0); + temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c0); + temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c0); + temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c0); + temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c1); + temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c1); + temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); + temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); + temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); + temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); + temp1[0] = dct_const_round_shift_64bit(temp1[0]); + temp1[1] = dct_const_round_shift_64bit(temp1[1]); + temp2[0] = dct_const_round_shift_64bit(temp2[0]); + temp2[1] = dct_const_round_shift_64bit(temp2[1]); + *out0 = pack_4(temp1[0], temp1[1]); + *out1 = pack_4(temp2[0], temp2[1]); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_partial_butterfly_sse2(const __m128i in, const int c0, + const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2], sign[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in, temp, sign); + *out0 = multiplication_round_shift_sse2(temp, sign, c0); + *out1 = multiplication_round_shift_sse2(temp, sign, c1); +} + +// Note: c0 and c1 must be non negative. 
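+// Same as highbd_partial_butterfly_sse2(), except that the first output is
+// negated before the rounding shift: out0 = round_shift(in * -c1) and
+// out1 = round_shift(in * c0). This lets callers apply a negative cosine
+// constant while still passing non negative c0 and c1.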
+static INLINE void highbd_partial_butterfly_neg_sse2(const __m128i in, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2], sign[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in, temp, sign); + *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1); + *out1 = multiplication_round_shift_sse2(temp, sign, c0); +} + +static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0, + const __m128i in1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp1[2], temp2, sign[2]; + + temp2 = _mm_add_epi32(in0, in1); + abs_extend_64bit_sse2(temp2, temp1, sign); + *out0 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + temp2 = _mm_sub_epi32(in0, in1); + abs_extend_64bit_sse2(temp2, temp1, sign); + *out1 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void highbd_add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi32(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi32(in[i], in[bound - i]); + i++; + } +} + +static INLINE void highbd_idct8_stage4(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); +} + +static INLINE void highbd_idct8x8_final_round(__m128i *const io) { + io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16)); + io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16)); + io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16)); + io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16)); + io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16)); + io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16)); + io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16)); + io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16)); +} + +static INLINE void highbd_idct16_4col_stage7(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[15]); + out[1] = _mm_add_epi32(in[1], in[14]); + out[2] = _mm_add_epi32(in[2], in[13]); + out[3] = _mm_add_epi32(in[3], in[12]); + out[4] = _mm_add_epi32(in[4], in[11]); + out[5] = _mm_add_epi32(in[5], in[10]); + out[6] = _mm_add_epi32(in[6], in[9]); + out[7] = _mm_add_epi32(in[7], in[8]); + out[8] = _mm_sub_epi32(in[7], in[8]); + out[9] = _mm_sub_epi32(in[6], in[9]); + out[10] = _mm_sub_epi32(in[5], in[10]); + out[11] = _mm_sub_epi32(in[4], in[11]); + out[12] = _mm_sub_epi32(in[3], in[12]); + out[13] = _mm_sub_epi32(in[2], in[13]); + out[14] = _mm_sub_epi32(in[1], in[14]); + out[15] = _mm_sub_epi32(in[0], in[15]); +} + +static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1, + const int bd) { + const __m128i zero = _mm_set1_epi16(0); // Faster than _mm_set1_epi16((1 << bd) - 1). 
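  // (The "Faster than" remark above belongs to the max bound built on the
  // next two lines: (1 << bd) - 1 is formed with a shift and subtract,
  // presumably because bd is a runtime value and a variable _mm_set1_epi16
  // would go through a scalar insert plus shuffle.) In scalar terms
  // add_clamp() is simply
  //
  //   d = clamp(in0 + in1, 0, (1 << bd) - 1);  /* saturate to bd-bit range */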
const __m128i one = _mm_set1_epi16(1); const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i d; + + d = _mm_adds_epi16(in0, in1); + d = _mm_max_epi16(d, zero); + d = _mm_min_epi16(d, max); + + return d; +} + +static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input, + uint16_t *dest, int stride, int bd, + const int size) { int a1, i, j; tran_low_t out; __m128i dc, d; - out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6); dc = _mm_set1_epi16(a1); for (i = 0; i < size; ++i) { - for (j = 0; j < (size >> 3); ++j) { - d = _mm_load_si128((const __m128i *)(&dest[j * 8])); - d = add_dc_clamp(&zero, &max, &dc, &d); - _mm_store_si128((__m128i *)(&dest[j * 8]), d); + for (j = 0; j < size; j += 8) { + d = _mm_load_si128((const __m128i *)(&dest[j])); + d = add_clamp(d, dc, bd); + _mm_store_si128((__m128i *)(&dest[j]), d); } dest += stride; } } -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; +static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest, + const int bd) { + __m128i d; + + d = _mm_loadl_epi64((const __m128i *)dest); + d = add_clamp(d, in, bd); + _mm_storel_epi64((__m128i *)dest, d); +} + +static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest, + const int stride, const int bd) { + __m128i d; + + d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride)); + d = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride))); + d = add_clamp(d, in, bd); + _mm_storel_epi64((__m128i *)(dest + 0 * stride), d); + _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d)); +} + +static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest, + const int stride, const int bd) { + recon_and_store_4x2(in[0], dest, stride, bd); + dest += 2 * stride; + recon_and_store_4x2(in[1], dest, stride, bd); +} + +static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest, + const int stride, const int bd) { + __m128i d; + + d = _mm_load_si128((const __m128i *)(*dest)); + d = add_clamp(d, in, bd); + _mm_store_si128((__m128i *)(*dest), d); + *dest += stride; +} + +static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest, + const int stride, const int bd) { + recon_and_store_8(in[0], &dest, stride, bd); + recon_and_store_8(in[1], &dest, stride, bd); + recon_and_store_8(in[2], &dest, stride, bd); + recon_and_store_8(in[3], &dest, stride, bd); + recon_and_store_8(in[4], &dest, stride, bd); + recon_and_store_8(in[5], &dest, stride, bd); + recon_and_store_8(in[6], &dest, stride, bd); + recon_and_store_8(in[7], &dest, stride, bd); +} + +static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) { + const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0)); + const __m128i t1 = 
_mm_load_si128((const __m128i *)(input + 4)); + return _mm_packs_epi32(t0, t1); +} + +static INLINE void highbd_load_pack_transpose_32bit_8x8(const tran_low_t *input, + const int stride, + __m128i *const in) { + in[0] = load_pack_8_32bit(input + 0 * stride); + in[1] = load_pack_8_32bit(input + 1 * stride); + in[2] = load_pack_8_32bit(input + 2 * stride); + in[3] = load_pack_8_32bit(input + 3 * stride); + in[4] = load_pack_8_32bit(input + 4 * stride); + in[5] = load_pack_8_32bit(input + 5 * stride); + in[6] = load_pack_8_32bit(input + 6 * stride); + in[7] = load_pack_8_32bit(input + 7 * stride); + transpose_16bit_8x8(in, in); +} + +static INLINE void highbd_load_transpose_32bit_8x4(const tran_low_t *input, + const int stride, + __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride + 0)); + in[1] = _mm_load_si128((const __m128i *)(input + 0 * stride + 4)); + in[2] = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); + in[3] = _mm_load_si128((const __m128i *)(input + 1 * stride + 4)); + in[4] = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride + 4)); + in[6] = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); + in[7] = _mm_load_si128((const __m128i *)(input + 3 * stride + 4)); + transpose_32bit_8x4(in, in); +} + +static INLINE void highbd_load_transpose_32bit_4x4(const tran_low_t *input, + const int stride, + __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + transpose_32bit_4x4(in, in); +} + +static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in, + const int bd) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i out; + + out = _mm_adds_epi16(in, final_rounding); + out = _mm_srai_epi16(out, 6); + recon_and_store_8(out, &dest, 0, bd); +} + +static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in, + const int bd) { + const __m128i final_rounding = _mm_set1_epi32(1 << 5); + __m128i out; + + out = _mm_add_epi32(in, final_rounding); + out = _mm_srai_epi32(out, 6); + out = _mm_packs_epi32(out, out); + recon_and_store_4(out, dest, bd); } #endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ diff --git a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h new file mode 100644 index 000000000..9c8eef40f --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ +#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_config.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" + +static INLINE __m128i multiplication_round_shift_sse4_1( + const __m128i *const in /*in[2]*/, const int c) { + const __m128i pair_c = pair_set_epi32(c * 4, 0); + __m128i t0, t1; + + t0 = _mm_mul_epi32(in[0], pair_c); + t1 = _mm_mul_epi32(in[1], pair_c); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i temp1[4], temp2[4]; + + extend_64bit(in0, temp1); + extend_64bit(in1, temp2); + temp1[2] = _mm_mul_epi32(temp1[0], pair_c1); + temp1[3] = _mm_mul_epi32(temp1[1], pair_c1); + temp1[0] = _mm_mul_epi32(temp1[0], pair_c0); + temp1[1] = _mm_mul_epi32(temp1[1], pair_c0); + temp2[2] = _mm_mul_epi32(temp2[0], pair_c0); + temp2[3] = _mm_mul_epi32(temp2[1], pair_c0); + temp2[0] = _mm_mul_epi32(temp2[0], pair_c1); + temp2[1] = _mm_mul_epi32(temp2[1], pair_c1); + temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); + temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); + temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); + temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); + temp1[0] = dct_const_round_shift_64bit(temp1[0]); + temp1[1] = dct_const_round_shift_64bit(temp1[1]); + temp2[0] = dct_const_round_shift_64bit(temp2[0]); + temp2[1] = dct_const_round_shift_64bit(temp2[1]); + *out0 = pack_4(temp1[0], temp1[1]); + *out1 = pack_4(temp2[0], temp2[1]); +} + +static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0, + const __m128i in1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp1[2], temp2; + + temp2 = _mm_add_epi32(in0, in1); + extend_64bit(temp2, temp1); + *out0 = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + temp2 = _mm_sub_epi32(in0, in1); + extend_64bit(temp2, temp1); + *out1 = multiplication_round_shift_sse4_1(temp1, cospi_16_64); +} + +static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2]; + + extend_64bit(in, temp); + *out0 = multiplication_round_shift_sse4_1(temp, c0); + *out1 = multiplication_round_shift_sse4_1(temp, c1); +} + +#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ diff --git a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c index 8670b2895..ec22db9f4 100644 --- a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -12,7 +12,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { __m128i ubounded; diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 2362476c1..cedf98aff 100644 --- a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <assert.h> #include <emmintrin.h> #include "vpx_dsp/vpx_dsp_common.h" @@ -37,54 +38,54 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); (void)scan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = ((int)count / 4) - 1; i >= 0; i--) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (test == 0xffff) - non_zero_regs--; - else - break; - } + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } - // Quantization pass: - for (i = 0; i < non_zero_regs; i++) { - __m128i coeffs, coeffs_sign, tmp1, tmp2; - int test; - int abs_coeff[4]; - int coeff_sign[4]; - - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); - tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); - tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); - tmp1 = _mm_or_si128(tmp1, tmp2); - test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i *)abs_coeff, coeffs); - _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); - - for (j = 0; j < 4; j++) { - if (test & (1 << (4 * j))) { - int k = 4 * i + j; - const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; - const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; - const uint32_t abs_qcoeff = - (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; - dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; - } + // Quantization pass: + for (i = 0; i < non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; + + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = _mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? 
iscan[k] : eob_i; } } } @@ -105,6 +106,9 @@ void vpx_highbd_quantize_b_32x32_sse2( const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; + (void)skip_block; + assert(!skip_block); + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); @@ -116,38 +120,35 @@ void vpx_highbd_quantize_b_32x32_sse2( memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs / 4; i++) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; - } + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = idx_arr[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; - } + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; } *eob_ptr = eob + 1; } diff --git a/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm index 30ee81b68..d9a6932e0 100644 --- a/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -72,7 +72,7 @@ SECTION .text paddd m6, m4 mov r1, ssem ; r1 = unsigned int *sse movd [r1], m7 ; store sse - movd rax, m6 ; store sum as return value + movd eax, m6 ; store sum as return value %endif RET %endmacro diff --git a/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm index 923418a99..e646767e1 100644 --- a/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm +++ b/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;unsigned int vpx_highbd_calc16x16var_sse2 ;( ; unsigned char * src_ptr, diff --git a/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/libvpx/vpx_dsp/x86/intrapred_sse2.asm index c18095c28..61af6236e 100644 --- a/libvpx/vpx_dsp/x86/intrapred_sse2.asm +++ b/libvpx/vpx_dsp/x86/intrapred_sse2.asm @@ -61,7 +61,7 @@ cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset psrlq m3, 8 movd [dstq+strideq ], m3 psrlq m0, 56 - movd tempq, m0 + movd tempd, m0 mov [dstq+strideq+3], tempb RESTORE_GOT diff --git a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c index f75dab07a..f6e56b6f9 100644 --- a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -8,19 +8,29 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <emmintrin.h> // SSE2 + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" +static INLINE void transpose_16bit_4(__m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); +} + void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { const __m128i eight = _mm_set1_epi16(8); __m128i in[2]; // Rows - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); idct4_sse2(in); // Columns @@ -41,7 +51,7 @@ void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int a; __m128i dc_value, d[2]; - a = (int)dct_const_round_shift(input[0] * cospi_16_64); + a = (int)dct_const_round_shift((int16_t)input[0] * cospi_16_64); a = (int)dct_const_round_shift(a * cospi_16_64); a = ROUND_POWER_OF_TWO(a, 4); @@ -69,35 +79,19 @@ void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); } -void idct4_sse2(__m128i *in) { +void idct4_sse2(__m128i *const in) { const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; + __m128i u[2]; - transpose_16bit_4x4(in); + transpose_16bit_4(in); // stage 1 u[0] = _mm_unpacklo_epi16(in[0], in[1]); u[1] = _mm_unpackhi_epi16(in[0], in[1]); - v[0] = 
_mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[3], v[2]); + u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]); + u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]); // stage 2 in[0] = _mm_add_epi16(u[0], u[1]); @@ -105,7 +99,7 @@ void idct4_sse2(__m128i *in) { in[1] = _mm_shuffle_epi32(in[1], 0x4E); } -void iadst4_sse2(__m128i *in) { +void iadst4_sse2(__m128i *const in) { const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); @@ -115,7 +109,7 @@ void iadst4_sse2(__m128i *in) { const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u[8], v[8], in7; - transpose_16bit_4x4(in); + transpose_16bit_4(in); in7 = _mm_srli_si128(in[1], 8); in7 = _mm_add_epi16(in7, in[0]); in7 = _mm_sub_epi16(in7, in[1]); @@ -154,215 +148,93 @@ void iadst4_sse2(__m128i *in) { in[1] = _mm_packs_epi32(u[2], u[3]); } -#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - } - -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \ - out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \ - stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - 
stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_add_epi16(stp1_0, stp2_7); \ - out1 = _mm_add_epi16(stp1_1, stp1_6); \ - out2 = _mm_add_epi16(stp1_2, stp1_5); \ - out3 = _mm_add_epi16(stp1_3, stp2_4); \ - out4 = _mm_sub_epi16(stp1_3, stp2_4); \ - out5 = _mm_sub_epi16(stp1_2, stp1_5); \ - out6 = _mm_sub_epi16(stp1_1, stp1_6); \ - out7 = _mm_sub_epi16(stp1_0, stp2_7); \ - } +static INLINE void load_buffer_8x8(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 8); + in[1] = load_input_data8(input + 1 * 8); + in[2] = load_input_data8(input + 2 * 8); + in[3] = load_input_data8(input + 3 * 8); + in[4] = load_input_data8(input + 4 * 8); + in[5] = load_input_data8(input + 5 * 8); + in[6] = load_input_data8(input + 6 * 8); + in[7] = load_input_data8(input + 7 * 8); +} void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in[8]; int i; // Load input data. 
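// The rewrite folds the old TRANSPOSE_8X8 + IDCT8 macro pair into
// idct8_sse2(), so the 2-D transform becomes the same 1-D routine run
// twice, once over rows and once over columns. The new body reduces to
// roughly:
//
//   load_buffer_8x8(input, in);          // 8 rows of 8 int16 coefficients
//   idct8_sse2(in);                      // pass 1: transpose + 1-D idct8
//   idct8_sse2(in);                      // pass 2: transpose + 1-D idct8
//   write_buffer_8x8(in, dest, stride);  // round (>> 5), add, clamp, store
//
// The removed lines below are the open-coded loads, butterflies, final
// rounding and RECON_AND_STORE calls that this replaces.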
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); + load_buffer_8x8(input, in); // 2-D for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, - in6, in7); + idct8_sse2(in); } - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); + write_buffer_8x8(in, dest, stride); } -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 5); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[8]; - dc_value = _mm_set1_epi16(a); + io[0] = load_input_data4(input + 0 * 8); + io[1] = load_input_data4(input + 1 * 8); + io[2] = load_input_data4(input + 2 * 8); + io[3] = load_input_data4(input + 3 * 8); - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - RECON_AND_STORE(dest + 7 * stride, dc_value); + idct8x8_12_add_kernel_sse2(io); + write_buffer_8x8(io, dest, stride); } -void idct8_sse2(__m128i *in) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); +static INLINE void recon_and_store_8_dual(uint8_t *const dest, + const __m128i in_x, + const int stride) { + const __m128i zero = _mm_setzero_si128(); + 
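  // Two 8-pixel rows are reconstructed per call: the bytes are widened to
  // 16 bits (unpack against zero), the shared DC residual is added to both
  // rows, and _mm_packus_epi16 narrows back with unsigned saturation, which
  // supplies the 0..255 pixel clamp for free. Per pixel this is the scalar
  //
  //   dest[x] = clip_pixel(dest[x] + a1);  /* clamp to [0, 255] */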
__m128i d0, d1; + + d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride)); + d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride)); + d0 = _mm_unpacklo_epi8(d0, zero); + d1 = _mm_unpacklo_epi8(d1, zero); + d0 = _mm_add_epi16(in_x, d0); + d1 = _mm_add_epi16(in_x, d1); + d0 = _mm_packus_epi16(d0, d1); + _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0); + _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0)); +} - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + dc_value = _mm_set1_epi16((int16_t)a1); + + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); +} +void idct8_sse2(__m128i *const in) { // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0, - in1, in2, in3, in4, in5, in6, in7); + transpose_16bit_8x8(in, in); // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3], - in[4], in[5], in[6], in[7]); + idct8(in, in); } -void iadst8_sse2(__m128i *in) { +void iadst8_sse2(__m128i *const in) { const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); @@ -375,7 +247,7 @@ void iadst8_sse2(__m128i *in) { const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__const_0 = _mm_set1_epi16(0); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -386,7 +258,7 @@ void iadst8_sse2(__m128i *in) { __m128i in0, in1, in2, in3, in4, in5, in6, in7; // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); // properly aligned for butterfly input in0 = in[7]; @@ -548,37 +420,10 @@ void iadst8_sse2(__m128i *in) { u2 = _mm_unpacklo_epi16(s6, s7); u3 = _mm_unpackhi_epi16(s6, s7); - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); - - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, 
k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); + s2 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_p16); + s3 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_m16); + s6 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_p16); + s7 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_m16); in[0] = s0; in[1] = _mm_sub_epi16(k__const_0, s4); @@ -590,521 +435,133 @@ void iadst8_sse2(__m128i *in) { in[7] = _mm_sub_epi16(k__const_0, s1); } -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); +static INLINE void idct16_load8x8(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 16); + in[1] = load_input_data8(input + 1 * 16); + in[2] = load_input_data8(input + 2 * 16); + in[3] = load_input_data8(input + 3 * 16); + in[4] = load_input_data8(input + 4 * 16); + in[5] = load_input_data8(input + 5 * 16); + in[6] = load_input_data8(input + 6 * 16); + in[7] = load_input_data8(input + 7 * 16); +} - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // Rows. Load 4-row input data. 
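// For eob <= 12 every nonzero coefficient of the 8x8 block sits in its
// top-left 4x4 corner, which is why the removed code below loads only four
// rows and prunes each butterfly stage against known-zero inputs. The new
// vpx_idct8x8_12_add_sse2 above keeps the same idea but loads four 4-wide
// rows with load_input_data4 and calls the named helper
// idct8x8_12_add_kernel_sse2 instead of open-coding the pruned stages.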
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - // Stage1 - { - const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - stp1_5 = _mm_packs_epi32(tmp4, tmp6); - } +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i l[16], r[16], out[16], *in; + int i; - // Stage2 - { - const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_04, stg2_0); - tmp2 = _mm_madd_epi16(lo_04, stg2_1); - tmp4 = _mm_madd_epi16(lo_26, stg2_2); - tmp6 = _mm_madd_epi16(lo_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp0, tmp2); - stp2_2 = _mm_packs_epi32(tmp6, tmp4); - - tmp0 = _mm_add_epi16(stp1_4, stp1_5); - tmp1 = _mm_sub_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); + in = l; + for (i = 0; i < 2; i++) { + idct16_load8x8(input, in); + transpose_16bit_8x8(in, in); + idct16_load8x8(input + 8, in + 8); + transpose_16bit_8x8(in + 8, in + 8); + idct16_8col(in, in); + in = r; + input += 128; } - // Stage3 - { - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); - tmp4 = _mm_add_epi16(stp2_0, stp2_2); - tmp6 = _mm_sub_epi16(stp2_0, stp2_2); + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, out[j]); + } - stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); - stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); + dest += 8; + } +} - tmp0 = _mm_madd_epi16(lo_56, stg3_0); - tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[16], temp[16], out[16]; + int i; - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + idct16_load8x8(input, in); + transpose_16bit_8x8(in, in); - stp1_5 = _mm_packs_epi32(tmp0, tmp2); + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); } + idct16_8col(in, temp); - // Stage4 - tmp0 = _mm_add_epi16(stp1_3, stp2_4); - tmp1 = _mm_add_epi16(stp1_2, stp1_5); - tmp2 = _mm_sub_epi16(stp1_3, stp2_4); - tmp3 = _mm_sub_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - IDCT8(in0, in1, in2, 
in3, zero, zero, zero, zero, in0, in1, in2, in3, in4, - in5, in6, in7); - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \ - stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \ - stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, 
stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, out[j]); + } -#define IDCT16_10 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \ - stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \ - stp1_12_0) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - 
const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_2 = stp1_1; \ - stp1_3 = stp1_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ + dest += 8; } +} -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - - const __m128i stg4_0 = 
pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[16], l[16], r[16], *curr1; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[16], l[16]; int i; - curr1 = l; - for (i = 0; i < 2; i++) { - // 1-D idct - - // Load input data. - in[0] = load_input_data(input); - in[8] = load_input_data(input + 8 * 1); - in[1] = load_input_data(input + 8 * 2); - in[9] = load_input_data(input + 8 * 3); - in[2] = load_input_data(input + 8 * 4); - in[10] = load_input_data(input + 8 * 5); - in[3] = load_input_data(input + 8 * 6); - in[11] = load_input_data(input + 8 * 7); - in[4] = load_input_data(input + 8 * 8); - in[12] = load_input_data(input + 8 * 9); - in[5] = load_input_data(input + 8 * 10); - in[13] = load_input_data(input + 8 * 11); - in[6] = load_input_data(input + 8 * 12); - in[14] = load_input_data(input + 8 * 13); - in[7] = load_input_data(input + 8 * 14); - in[15] = load_input_data(input + 8 * 15); - - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - - IDCT16 - - // Stage7 - curr1[0] = _mm_add_epi16(stp2_0, stp1_15); - curr1[1] = _mm_add_epi16(stp2_1, stp1_14); - curr1[2] = _mm_add_epi16(stp2_2, stp2_13); - curr1[3] = _mm_add_epi16(stp2_3, stp2_12); - curr1[4] = _mm_add_epi16(stp2_4, stp2_11); - curr1[5] = _mm_add_epi16(stp2_5, stp2_10); - curr1[6] = _mm_add_epi16(stp2_6, stp1_9); - curr1[7] = _mm_add_epi16(stp2_7, stp1_8); - curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); - curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); - curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); - curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); - curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); - curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); - curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); - curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); - - curr1 = r; - input += 128; - } - for (i = 0; i < 2; i++) { + // First 1-D inverse DCT + // Load input data. 
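// With eob <= 10 every nonzero coefficient lies in the top-left 4x4 of the
// 16x16 block, so pass 1 only needs four 4-coefficient loads
// (load_input_data4) and the pruned idct16x16_10_pass1/idct16x16_10_pass2
// kernels in place of the full 16-column butterflies that the removed
// IDCT16 macro expanded to.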
+ in[0] = load_input_data4(input + 0 * 16); + in[1] = load_input_data4(input + 1 * 16); + in[2] = load_input_data4(input + 2 * 16); + in[3] = load_input_data4(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 16; i += 8) { int j; - // 1-D idct - array_transpose_8x8(l + i * 8, in); - array_transpose_8x8(r + i * 8, in + 8); - - IDCT16 - - // 2-D - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); + idct16x16_10_pass2(l + i, in); for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + write_buffer_8x1(dest + j * stride, in[j]); } dest += 8; } } +static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0, d1; + + d0 = _mm_load_si128((__m128i *)(dest)); + d1 = _mm_unpackhi_epi8(d0, zero); + d0 = _mm_unpacklo_epi8(d0, zero); + d0 = _mm_add_epi16(in_x, d0); + d1 = _mm_add_epi16(in_x, d1); + d0 = _mm_packus_epi16(d0, d1); + _mm_store_si128((__m128i *)(dest), d0); +} + void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, i; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); + int i; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - dc_value = _mm_set1_epi16(a); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + dc_value = _mm_set1_epi16((int16_t)a1); for (i = 0; i < 16; ++i) { - RECON_AND_STORE(dest + 0, dc_value); - RECON_AND_STORE(dest + 8, dc_value); + recon_and_store_16(dest, dc_value); dest += stride; } } -static void iadst16_8col(__m128i *in) { +static void iadst16_8col(__m128i *const in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -1132,8 +589,8 @@ static void iadst16_8col(__m128i *in) { const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 
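// idct_calc_wraplow_sse2(), which this patch substitutes for the long
// madd/add/srai/packs sequences (as in the hunk below), wraps the common
// fixed-point step: two _mm_madd_epi16 results (each 16-bit lane pair
// (a, b) against a cosine pair (c0, c1)) are rounded, shifted and re-packed
// to 16 bits in one helper. Per 32-bit lane that is
//
//   out = (int16_t)((a * c0 + b * c1 + (1 << 13)) >> 14);  // DCT_CONST_BITS == 14
//
// and the saturating _mm_packs_epi32 agrees with the C WRAPLOW() result
// whenever the value fits in 16 bits, which holds for conformant streams.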
@@ -1505,1718 +962,371 @@ static void iadst16_8col(__m128i *in) { u[6] = _mm_unpacklo_epi16(s[14], s[15]); u[7] = _mm_unpackhi_epi16(s[14], s[15]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16); + in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16); + in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16); + in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16); + in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16); + in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16); + in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16); + in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16); in[0] = s[0]; in[1] = _mm_sub_epi16(kZero, s[8]); in[2] = s[12]; in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); in[12] = s[5]; in[13] = _mm_sub_epi16(kZero, s[13]); 
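// [Editor's note] Worked equations for the helpers used above, inferred from
// the removed madd/round/shift/pack sequences they replace:
//   idct_calc_wraplow_sse2(lo, hi, c) ==
//     _mm_packs_epi32(dct_const_round_shift(_mm_madd_epi16(lo, c)),
//                     dct_const_round_shift(_mm_madd_epi16(hi, c)))
// i.e. per interleaved pair (a, b): (a * c_even + b * c_odd + 8192) >> 14,
// saturated back to 16 bits; the in[7]/in[8]/... reordering folds the old
// v[]/u[] packing directly into the output slots. Likewise, judging from the
// pair_set_epi16(c0, -c1) / pair_set_epi16(c1, c0) constants it supersedes,
// butterfly(a, b, c0, c1, &o0, &o1) in the 32-point code below computes the
// rotation o0 = round(a * c0 - b * c1), o1 = round(a * c1 + b * c0).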
in[14] = s[9]; in[15] = _mm_sub_epi16(kZero, s[1]); } -static void idct16_8col(__m128i *in) { - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i v[16], u[16], s[16], t[16]; +void idct16_sse2(__m128i *const in0, __m128i *const in1) { + transpose_16bit_16x16(in0, in1); + idct16_8col(in0, in0); + idct16_8col(in1, in1); +} - // stage 1 - s[0] = in[0]; - s[1] = in[8]; - s[2] = in[4]; - s[3] = in[12]; - s[4] = in[2]; - s[5] = in[10]; - s[6] = in[6]; - s[7] = in[14]; - s[8] = in[1]; - s[9] = in[9]; - s[10] = in[5]; - s[11] = in[13]; - s[12] = in[3]; - s[13] = in[11]; - s[14] = in[7]; - s[15] = in[15]; +void iadst16_sse2(__m128i *const in0, __m128i *const in1) { + transpose_16bit_16x16(in0, in1); + iadst16_8col(in0); + iadst16_8col(in1); +} - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[15]); - u[1] = _mm_unpackhi_epi16(s[8], s[15]); - u[2] = _mm_unpacklo_epi16(s[9], s[14]); - u[3] = _mm_unpackhi_epi16(s[9], s[14]); - u[4] = _mm_unpacklo_epi16(s[10], s[13]); - u[5] = _mm_unpackhi_epi16(s[10], s[13]); - u[6] = _mm_unpacklo_epi16(s[11], s[12]); - u[7] = _mm_unpackhi_epi16(s[11], s[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); - v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); - v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); - v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); - v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); - v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); - v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); - v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); - v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); - v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); - v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); - v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); - v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); - v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], 
k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[8] = _mm_packs_epi32(u[0], u[1]); - s[15] = _mm_packs_epi32(u[2], u[3]); - s[9] = _mm_packs_epi32(u[4], u[5]); - s[14] = _mm_packs_epi32(u[6], u[7]); - s[10] = _mm_packs_epi32(u[8], u[9]); - s[13] = _mm_packs_epi32(u[10], u[11]); - s[11] = _mm_packs_epi32(u[12], u[13]); - s[12] = _mm_packs_epi32(u[14], u[15]); +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[8], step2[8]; // stage 3 - t[0] = s[0]; - t[1] = s[1]; - t[2] = s[2]; - t[3] = s[3]; - u[0] = _mm_unpacklo_epi16(s[4], s[7]); - u[1] = _mm_unpackhi_epi16(s[4], s[7]); - u[2] = _mm_unpacklo_epi16(s[5], s[6]); - u[3] = _mm_unpackhi_epi16(s[5], s[6]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - 
u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[4] = _mm_packs_epi32(u[0], u[1]); - t[7] = _mm_packs_epi32(u[2], u[3]); - t[5] = _mm_packs_epi32(u[4], u[5]); - t[6] = _mm_packs_epi32(u[6], u[7]); - t[8] = _mm_add_epi16(s[8], s[9]); - t[9] = _mm_sub_epi16(s[8], s[9]); - t[10] = _mm_sub_epi16(s[11], s[10]); - t[11] = _mm_add_epi16(s[10], s[11]); - t[12] = _mm_add_epi16(s[12], s[13]); - t[13] = _mm_sub_epi16(s[12], s[13]); - t[14] = _mm_sub_epi16(s[15], s[14]); - t[15] = _mm_add_epi16(s[14], s[15]); + butterfly(in[4], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]); // stage 4 - u[0] = _mm_unpacklo_epi16(t[0], t[1]); - u[1] = _mm_unpackhi_epi16(t[0], t[1]); - u[2] = _mm_unpacklo_epi16(t[2], t[3]); - u[3] = _mm_unpackhi_epi16(t[2], t[3]); - u[4] = _mm_unpacklo_epi16(t[9], t[14]); - u[5] = _mm_unpackhi_epi16(t[9], t[14]); - u[6] = _mm_unpacklo_epi16(t[10], t[13]); - u[7] = _mm_unpackhi_epi16(t[10], t[13]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); - v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); - v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = 
_mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_add_epi16(t[4], t[5]); - s[5] = _mm_sub_epi16(t[4], t[5]); - s[6] = _mm_sub_epi16(t[7], t[6]); - s[7] = _mm_add_epi16(t[6], t[7]); - s[8] = t[8]; - s[15] = t[15]; - s[9] = _mm_packs_epi32(u[8], u[9]); - s[14] = _mm_packs_epi32(u[10], u[11]); - s[10] = _mm_packs_epi32(u[12], u[13]); - s[13] = _mm_packs_epi32(u[14], u[15]); - s[11] = t[11]; - s[12] = t[12]; + step2[0] = butterfly_cospi16(in[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; // stage 5 - t[0] = _mm_add_epi16(s[0], s[3]); - t[1] = _mm_add_epi16(s[1], s[2]); - t[2] = _mm_sub_epi16(s[1], s[2]); - t[3] = _mm_sub_epi16(s[0], s[3]); - t[4] = s[4]; - t[7] = s[7]; - - u[0] = _mm_unpacklo_epi16(s[5], s[6]); - u[1] = _mm_unpackhi_epi16(s[5], s[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - t[5] = _mm_packs_epi32(u[0], u[1]); - t[6] = _mm_packs_epi32(u[2], u[3]); - - t[8] = _mm_add_epi16(s[8], s[11]); - t[9] = _mm_add_epi16(s[9], s[10]); - t[10] = _mm_sub_epi16(s[9], s[10]); - t[11] = _mm_sub_epi16(s[8], s[11]); - t[12] = _mm_sub_epi16(s[15], s[12]); - t[13] = _mm_sub_epi16(s[14], s[13]); - t[14] = _mm_add_epi16(s[13], s[14]); - t[15] = _mm_add_epi16(s[12], s[15]); + step1[0] = step2[0]; + step1[1] = step2[0]; + step1[2] = step2[0]; + step1[3] = step2[0]; + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; // stage 6 - s[0] = _mm_add_epi16(t[0], t[7]); - s[1] = _mm_add_epi16(t[1], t[6]); - s[2] = _mm_add_epi16(t[2], t[5]); - s[3] = _mm_add_epi16(t[3], t[4]); - s[4] = _mm_sub_epi16(t[3], t[4]); - s[5] = _mm_sub_epi16(t[2], t[5]); - s[6] = _mm_sub_epi16(t[1], t[6]); - s[7] = _mm_sub_epi16(t[0], t[7]); - s[8] = t[8]; - s[9] = t[9]; - - u[0] = _mm_unpacklo_epi16(t[10], t[13]); - u[1] = _mm_unpackhi_epi16(t[10], t[13]); - u[2] = _mm_unpacklo_epi16(t[11], t[12]); - u[3] = _mm_unpackhi_epi16(t[11], t[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = 
_mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - s[10] = _mm_packs_epi32(u[0], u[1]); - s[13] = _mm_packs_epi32(u[2], u[3]); - s[11] = _mm_packs_epi32(u[4], u[5]); - s[12] = _mm_packs_epi32(u[6], u[7]); - s[14] = t[14]; - s[15] = t[15]; - - // stage 7 - in[0] = _mm_add_epi16(s[0], s[15]); - in[1] = _mm_add_epi16(s[1], s[14]); - in[2] = _mm_add_epi16(s[2], s[13]); - in[3] = _mm_add_epi16(s[3], s[12]); - in[4] = _mm_add_epi16(s[4], s[11]); - in[5] = _mm_add_epi16(s[5], s[10]); - in[6] = _mm_add_epi16(s[6], s[9]); - in[7] = _mm_add_epi16(s[7], s[8]); - in[8] = _mm_sub_epi16(s[7], s[8]); - in[9] = _mm_sub_epi16(s[6], s[9]); - in[10] = _mm_sub_epi16(s[5], s[10]); - in[11] = _mm_sub_epi16(s[4], s[11]); - in[12] = _mm_sub_epi16(s[3], s[12]); - in[13] = _mm_sub_epi16(s[2], s[13]); - in[14] = _mm_sub_epi16(s[1], s[14]); - in[15] = _mm_sub_epi16(s[0], s[15]); + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); } -void idct16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - idct16_8col(in0); - idct16_8col(in1); +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[16], step2[16]; + + // stage 2 + butterfly(in[2], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(zero, in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); } -void iadst16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - iadst16_8col(in0); - iadst16_8col(in1); +static INLINE void idct32_34_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_34_8x32_quarter_1(in, temp); + idct32_34_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); } -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); +// For each 8x32 block __m128i in[32], +// Input with odd index, 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { const __m128i zero = _mm_setzero_si128(); + __m128i step1[32]; - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + // stage 1 + butterfly(in[1], zero, cospi_31_64, 
cospi_1_64, &step1[16], &step1[31]); + butterfly(zero, in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + butterfly(in[5], zero, cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly(zero, in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + // stage 3 + butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); +void idct32_34_8x32_sse2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in[16], l[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, - stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, - stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - // First 1-D inverse DCT - // Load input data. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 2); - in[2] = load_input_data(input + 8 * 4); - in[3] = load_input_data(input + 8 * 6); - - TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); - - // Stage2 - { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); - const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); - - tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); - tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); - tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); - tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp2_8 = _mm_packs_epi32(tmp0, tmp2); - stp2_11 = _mm_packs_epi32(tmp5, tmp7); - } + idct32_34_8x32_quarter_1_2(in, temp); + idct32_34_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} - // Stage3 - { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); +// Only upper-left 8x8 has non-zero coeff +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[32], col[32]; + int i; - tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); - tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); + // Load input data. Only need to load the top left 8x8 block. 
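  // [Editor's note] Context for the load below: the vp9 decoder dispatches to
  // this _34 variant only when the block's eob is at most 34, and in the
  // 32x32 scan order those leading coefficients all land in the top-left 8x8
  // corner; hence a single 8x8 load, with every other lane treated as zero by
  // the quarter functions above. (Editorial gloss, not part of this patch.)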
+ load_transpose_16bit_8x8(input, 32, io); + idct32_34_8x32_sse2(io, col); - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + for (i = 0; i < 32; i += 8) { + int j; + transpose_16bit_8x8(col + i, io); + idct32_34_8x32_sse2(io, io); - stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); - stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); + for (j = 0; j < 32; ++j) { + write_buffer_8x1(dest + j * stride, io[j]); + } - stp1_4 = _mm_packs_epi32(tmp0, tmp2); + dest += 8; } +} - // Stage4 - { - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); - - tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); - tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); - tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); - tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); - tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); - tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp1_0 = _mm_packs_epi32(tmp0, tmp0); - stp1_1 = _mm_packs_epi32(tmp2, tmp2); - stp2_9 = _mm_packs_epi32(tmp1, tmp3); - stp2_10 = _mm_packs_epi32(tmp5, tmp7); - - stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); - } +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; - // Stage5 and Stage6 - { - tmp0 = _mm_add_epi16(stp2_8, stp2_11); - tmp1 = _mm_sub_epi16(stp2_8, stp2_11); - tmp2 = _mm_add_epi16(stp2_9, stp2_10); - tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - - stp1_9 = _mm_unpacklo_epi64(tmp2, zero); - stp1_10 = _mm_unpacklo_epi64(tmp3, zero); - stp1_8 = _mm_unpacklo_epi64(tmp0, zero); - stp1_11 = _mm_unpacklo_epi64(tmp1, zero); - - stp1_13 = _mm_unpackhi_epi64(tmp3, zero); - stp1_14 = _mm_unpackhi_epi64(tmp2, zero); - stp1_12 = _mm_unpackhi_epi64(tmp1, zero); - stp1_15 = _mm_unpackhi_epi64(tmp0, zero); - } + // stage 3 + butterfly(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); - // Stage6 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); - - tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); - tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); - tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); - tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); - tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); - tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = 
_mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_6 = _mm_packs_epi32(tmp3, tmp1); - - stp2_10 = _mm_packs_epi32(tmp0, zero); - stp2_13 = _mm_packs_epi32(tmp2, zero); - stp2_11 = _mm_packs_epi32(tmp4, zero); - stp2_12 = _mm_packs_epi32(tmp6, zero); - - tmp0 = _mm_add_epi16(stp1_0, stp1_4); - tmp1 = _mm_sub_epi16(stp1_0, stp1_4); - tmp2 = _mm_add_epi16(stp1_1, stp1_6); - tmp3 = _mm_sub_epi16(stp1_1, stp1_6); - - stp2_0 = _mm_unpackhi_epi64(tmp0, zero); - stp2_1 = _mm_unpacklo_epi64(tmp2, zero); - stp2_2 = _mm_unpackhi_epi64(tmp2, zero); - stp2_3 = _mm_unpacklo_epi64(tmp0, zero); - stp2_4 = _mm_unpacklo_epi64(tmp1, zero); - stp2_5 = _mm_unpackhi_epi64(tmp3, zero); - stp2_6 = _mm_unpacklo_epi64(tmp3, zero); - stp2_7 = _mm_unpackhi_epi64(tmp1, zero); - } + // stage 4 + butterfly(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); - // Stage7. Left 8x16 only. - l[0] = _mm_add_epi16(stp2_0, stp1_15); - l[1] = _mm_add_epi16(stp2_1, stp1_14); - l[2] = _mm_add_epi16(stp2_2, stp2_13); - l[3] = _mm_add_epi16(stp2_3, stp2_12); - l[4] = _mm_add_epi16(stp2_4, stp2_11); - l[5] = _mm_add_epi16(stp2_5, stp2_10); - l[6] = _mm_add_epi16(stp2_6, stp1_9); - l[7] = _mm_add_epi16(stp2_7, stp1_8); - l[8] = _mm_sub_epi16(stp2_7, stp1_8); - l[9] = _mm_sub_epi16(stp2_6, stp1_9); - l[10] = _mm_sub_epi16(stp2_5, stp2_10); - l[11] = _mm_sub_epi16(stp2_4, stp2_11); - l[12] = _mm_sub_epi16(stp2_3, stp2_12); - l[13] = _mm_sub_epi16(stp2_2, stp2_13); - l[14] = _mm_sub_epi16(stp2_1, stp1_14); - l[15] = _mm_sub_epi16(stp2_0, stp1_15); + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; - // Second 1-D inverse transform, performed per 8x16 block - for (i = 0; i < 2; i++) { - int j; - array_transpose_4X8(l + 8 * i, in); - - IDCT16_10 - - // Stage7 - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} - for (j = 
0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; - dest += 8; - } + // stage 2 + butterfly(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[11], step2[10]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[15], step2[14]); + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); } -#define LOAD_DQCOEFF(reg, input) \ - { \ - reg = load_input_data(input); \ - input += 8; \ - } +static INLINE void idct32_1024_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_1024_8x32_quarter_1(in, temp); + idct32_1024_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} -#define IDCT32_34 \ - /* Stage1 */ \ - { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ - \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ - \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \ - stp1_31); \ - MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \ - stp1_28); \ - MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \ - stp1_27); \ - MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \ - stp1_24); \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ - \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \ - stp2_15); \ - MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \ - stp2_12); \ - \ - stp2_16 = stp1_16; \ - stp2_19 = stp1_19; \ - \ - stp2_20 = stp1_20; \ - stp2_23 = stp1_23; \ - \ - stp2_24 = stp1_24; \ - stp2_27 = stp1_27; \ - \ - stp2_28 = stp1_28; \ - stp2_31 = stp1_31; \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, 
stp1_28); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ - \ - MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \ - stp1_7); \ - \ - stp1_8 = stp2_8; \ - stp1_11 = stp2_11; \ - stp1_12 = stp2_12; \ - stp1_15 = stp2_15; \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \ - stp2_1); \ - \ - stp2_4 = stp1_4; \ - stp2_5 = stp1_4; \ - stp2_6 = stp1_7; \ - stp2_7 = stp1_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = stp2_0; \ - stp1_1 = stp2_1; \ - stp1_2 = stp2_1; \ - stp1_3 = stp2_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = 
_mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, 
stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } +// For each 8x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; -#define IDCT32 \ - /* Stage1 */ \ - { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \ - stp1_30) \ - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \ - stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ 
- \ - const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ - stp2_14) \ - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - \ - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ - stp1_6) \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, 
stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, 
stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - 
MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } + // stage 1 + butterfly(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]); + butterfly(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]); + butterfly(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); -// Only upper-left 8x8 has non-zero coeff -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[32]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; + butterfly(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]); - // Load input data. Only need to load the top left 8x8 block. 
- in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); - - array_transpose_8x8(in, in); - IDCT32_34 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[0] = _mm_add_epi16(stp1_0, stp1_31); - col[1] = _mm_add_epi16(stp1_1, stp1_30); - col[2] = _mm_add_epi16(stp1_2, stp1_29); - col[3] = _mm_add_epi16(stp1_3, stp1_28); - col[4] = _mm_add_epi16(stp1_4, stp1_27); - col[5] = _mm_add_epi16(stp1_5, stp1_26); - col[6] = _mm_add_epi16(stp1_6, stp1_25); - col[7] = _mm_add_epi16(stp1_7, stp1_24); - col[8] = _mm_add_epi16(stp1_8, stp1_23); - col[9] = _mm_add_epi16(stp1_9, stp1_22); - col[10] = _mm_add_epi16(stp1_10, stp1_21); - col[11] = _mm_add_epi16(stp1_11, stp1_20); - col[12] = _mm_add_epi16(stp1_12, stp1_19); - col[13] = _mm_add_epi16(stp1_13, stp1_18); - col[14] = _mm_add_epi16(stp1_14, stp1_17); - col[15] = _mm_add_epi16(stp1_15, stp1_16); - col[16] = _mm_sub_epi16(stp1_15, stp1_16); - col[17] = _mm_sub_epi16(stp1_14, stp1_17); - col[18] = _mm_sub_epi16(stp1_13, stp1_18); - col[19] = _mm_sub_epi16(stp1_12, stp1_19); - col[20] = _mm_sub_epi16(stp1_11, stp1_20); - col[21] = _mm_sub_epi16(stp1_10, stp1_21); - col[22] = _mm_sub_epi16(stp1_9, stp1_22); - col[23] = _mm_sub_epi16(stp1_8, stp1_23); - col[24] = _mm_sub_epi16(stp1_7, stp1_24); - col[25] = _mm_sub_epi16(stp1_6, stp1_25); - col[26] = _mm_sub_epi16(stp1_5, stp1_26); - col[27] = _mm_sub_epi16(stp1_4, stp1_27); - col[28] = _mm_sub_epi16(stp1_3, stp1_28); - col[29] = _mm_sub_epi16(stp1_2, stp1_29); - col[30] = _mm_sub_epi16(stp1_1, stp1_30); - col[31] = _mm_sub_epi16(stp1_0, stp1_31); - for (i = 0; i < 4; i++) { - int j; - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - IDCT32_34 - - // 2_D: Calculate the results and store them to destination. 
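// (The two IDCT32_34 invocations above realize the separable 2-D
// inverse transform: the first pass runs the 1-D IDCT32 across the
// rows and keeps all 32 intermediate vectors in col[], then each loop
// iteration transposes an 8x32 tile back and runs the same 1-D IDCT32
// down the columns; the adds/subtracts below form the last butterfly
// stage before the final rounding, shift by 6 and reconstruction.)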
- in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); + butterfly(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]); + butterfly(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } + // stage 2 + step2[16] = _mm_add_epi16(step1[16], step1[17]); + step2[17] = _mm_sub_epi16(step1[16], step1[17]); + step2[18] = _mm_sub_epi16(step1[19], step1[18]); + step2[19] = _mm_add_epi16(step1[19], step1[18]); + step2[20] = _mm_add_epi16(step1[20], step1[21]); + step2[21] = _mm_sub_epi16(step1[20], step1[21]); + step2[22] = _mm_sub_epi16(step1[23], step1[22]); + step2[23] = _mm_add_epi16(step1[23], step1[22]); + + step2[24] = _mm_add_epi16(step1[24], step1[25]); + step2[25] = _mm_sub_epi16(step1[24], step1[25]); + step2[26] = _mm_sub_epi16(step1[27], step1[26]); + step2[27] = _mm_add_epi16(step1[27], step1[26]); + step2[28] = _mm_add_epi16(step1[28], step1[29]); + step2[29] = _mm_sub_epi16(step1[28], step1[29]); + step2[30] = _mm_sub_epi16(step1[31], step1[30]); + step2[31] = _mm_add_epi16(step1[31], step1[30]); - dest += 8; - } + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_1024_8x32(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + + idct32_1024_8x32_quarter_1_2(in, temp); + idct32_1024_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); } void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i 
rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[128], zero_idx[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, 
stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; + __m128i col[4][32], io[32]; + int i; + // rows for (i = 0; i < 4; i++) { - i32 = (i << 5); - // First 1-D idct - // Load input data. - LOAD_DQCOEFF(in[0], input); - LOAD_DQCOEFF(in[8], input); - LOAD_DQCOEFF(in[16], input); - LOAD_DQCOEFF(in[24], input); - LOAD_DQCOEFF(in[1], input); - LOAD_DQCOEFF(in[9], input); - LOAD_DQCOEFF(in[17], input); - LOAD_DQCOEFF(in[25], input); - LOAD_DQCOEFF(in[2], input); - LOAD_DQCOEFF(in[10], input); - LOAD_DQCOEFF(in[18], input); - LOAD_DQCOEFF(in[26], input); - LOAD_DQCOEFF(in[3], input); - LOAD_DQCOEFF(in[11], input); - LOAD_DQCOEFF(in[19], input); - LOAD_DQCOEFF(in[27], input); - - LOAD_DQCOEFF(in[4], input); - LOAD_DQCOEFF(in[12], input); - LOAD_DQCOEFF(in[20], input); - LOAD_DQCOEFF(in[28], input); - LOAD_DQCOEFF(in[5], input); - LOAD_DQCOEFF(in[13], input); - LOAD_DQCOEFF(in[21], input); - LOAD_DQCOEFF(in[29], input); - LOAD_DQCOEFF(in[6], input); - LOAD_DQCOEFF(in[14], input); - LOAD_DQCOEFF(in[22], input); - LOAD_DQCOEFF(in[30], input); - LOAD_DQCOEFF(in[7], input); - LOAD_DQCOEFF(in[15], input); - LOAD_DQCOEFF(in[23], input); - LOAD_DQCOEFF(in[31], input); - - // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in[0], in[1]); - zero_idx[1] = _mm_or_si128(in[2], in[3]); - zero_idx[2] = _mm_or_si128(in[4], in[5]); - zero_idx[3] = _mm_or_si128(in[6], in[7]); - zero_idx[4] = _mm_or_si128(in[8], in[9]); - zero_idx[5] = _mm_or_si128(in[10], in[11]); - zero_idx[6] = _mm_or_si128(in[12], in[13]); - zero_idx[7] = _mm_or_si128(in[14], in[15]); - zero_idx[8] = _mm_or_si128(in[16], in[17]); - zero_idx[9] = _mm_or_si128(in[18], in[19]); - zero_idx[10] = _mm_or_si128(in[20], in[21]); - zero_idx[11] = _mm_or_si128(in[22], in[23]); - zero_idx[12] = _mm_or_si128(in[24], in[25]); - zero_idx[13] = _mm_or_si128(in[26], in[27]); - zero_idx[14] = _mm_or_si128(in[28], in[29]); - zero_idx[15] = _mm_or_si128(in[30], in[31]); - - zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); - zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); - - zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); - - if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = _mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] 
= _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] = _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } + load_transpose_16bit_8x8(&input[0], 32, &io[0]); + load_transpose_16bit_8x8(&input[8], 32, &io[8]); + load_transpose_16bit_8x8(&input[16], 32, &io[16]); + load_transpose_16bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + // columns + for (i = 0; i < 32; i += 8) { // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + + idct32_1024_8x32(io, io); + store_buffer_8x32(io, dest, stride); + dest += 8; } - for (i = 0; i < 4; i++) { - // Second 1-D idct - j = i << 3; +} - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); - - IDCT32 - - // 2_D: Calculate the results and store them to destination. 
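// (The old first pass above skipped all-zero 8x32 tiles by OR-ing the
// 32 input vectors together and testing the result with a compare plus
// movemask. A minimal standalone sketch of that test -- the helper
// name is illustrative only, requires <emmintrin.h>:)
static int idct_rows_are_all_zero(const __m128i *v, int n) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = zero;
  int k;
  for (k = 0; k < n; ++k) acc = _mm_or_si128(acc, v[k]);
  // acc == 0 iff every 32-bit compare lane is all ones, i.e. the
  // 16-bit byte mask is 0xFFFF.
  return _mm_movemask_epi8(_mm_cmpeq_epi32(acc, zero)) == 0xFFFF;
}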
- in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[2][32], in[32], out[32]; + int i; - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } + for (i = 16; i < 32; i++) { + in[i] = _mm_setzero_si128(); + } + // rows + for (i = 0; i < 2; i++) { + load_transpose_16bit_8x8(&input[0], 32, &in[0]); + load_transpose_16bit_8x8(&input[8], 32, &in[8]); + idct32_1024_8x32(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_1024_8x32(in, out); + store_buffer_8x32(out, dest, stride); dest += 8; } } @@ -3224,19 +1334,17 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, j; + int j; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + dc_value = _mm_set1_epi16((int16_t)a1); for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + 0 + j * stride, dc_value); - RECON_AND_STORE(dest + 8 + j * stride, dc_value); - RECON_AND_STORE(dest + 16 + j * stride, dc_value); - RECON_AND_STORE(dest + 24 + j * stride, dc_value); + recon_and_store_16(dest + j * stride + 0, dc_value); + recon_and_store_16(dest + j * stride + 16, dc_value); } } diff --git a/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/libvpx/vpx_dsp/x86/inv_txfm_sse2.h index 0460ab13b..5cd5098f1 100644 --- a/libvpx/vpx_dsp/x86/inv_txfm_sse2.h +++ b/libvpx/vpx_dsp/x86/inv_txfm_sse2.h @@ -12,272 +12,173 @@ #define VPX_DSP_X86_INV_TXFM_SSE2_H_ #include <emmintrin.h> // SSE2 + #include "./vpx_config.h" #include 
"vpx/vpx_integer.h" #include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -// perform 8x8 transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); +static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 30 31 32 33 00 01 02 03 + // in[1]: 20 21 22 23 10 11 12 13 + // in[2]: 40 41 42 43 70 71 72 73 + // in[3]: 50 51 52 53 60 61 62 63 + // to: + // tr0_0: 00 10 01 11 02 12 03 13 + // tr0_1: 20 30 21 31 22 32 23 33 + // tr0_2: 40 50 41 51 42 52 43 53 + // tr0_3: 60 70 61 71 62 72 63 73 + const __m128i tr0_0 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[1], in[0]); + const __m128i tr0_2 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[3], in[2]); + // Unpack 32 bit elements resulting in: + // tr1_0: 00 10 20 30 01 11 21 31 + // tr1_1: 02 12 22 32 03 13 23 33 + // tr1_2: 40 50 60 70 41 51 61 71 + // tr1_3: 42 52 62 72 43 53 63 73 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); } -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ 
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
-    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
-    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
-    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
-                                                            \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                \
-    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5);                \
-    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5);                \
-    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7);                \
-    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7);                \
-  }
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1)   \
-  {                                                     \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-                                                        \
-    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
-    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
-  }
+static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) {
+  const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING));
+  return _mm_srai_epi32(t, DCT_CONST_BITS);
+}
-static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+static INLINE __m128i idct_madd_round_shift_sse2(const __m128i in,
+                                                 const __m128i cospi) {
+  const __m128i t = _mm_madd_epi16(in, cospi);
+  return dct_const_round_shift_sse2(t);
+}
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+// Calculate the dot product between in0/1 and x and wrap to short.
+static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0,
+                                             const __m128i in1,
+                                             const __m128i x) {
+  const __m128i t0 = idct_madd_round_shift_sse2(in0, x);
+  const __m128i t1 = idct_madd_round_shift_sse2(in1, x);
+  return _mm_packs_epi32(t0, t1);
+}
-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+// Multiply elements by constants and add them together.
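// (butterfly(in0, in1, c0, c1, out0, out1) below computes, per signed
// 16-bit lane, the fixed-point rotation
//   *out0 = ROUND_POWER_OF_TWO(in0 * c0 - in1 * c1, DCT_CONST_BITS)
//   *out1 = ROUND_POWER_OF_TWO(in0 * c1 + in1 * c0, DCT_CONST_BITS)
// with DCT_CONST_BITS == 14: pair_set_epi16 interleaves (c0, -c1) and
// (c1, c0), each _mm_madd_epi16 in idct_calc_wraplow_sse2() forms the
// two dot products, and the rounded 32-bit results are saturated back
// to 16 bits by _mm_packs_epi32.)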
+static INLINE void butterfly(const __m128i in0, const __m128i in1, const int c0, + const int c1, __m128i *const out0, + __m128i *const out1) { + const __m128i cst0 = pair_set_epi16(c0, -c1); + const __m128i cst1 = pair_set_epi16(c1, c0); + const __m128i lo = _mm_unpacklo_epi16(in0, in1); + const __m128i hi = _mm_unpackhi_epi16(in0, in1); + *out0 = idct_calc_wraplow_sse2(lo, hi, cst0); + *out1 = idct_calc_wraplow_sse2(lo, hi, cst1); } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; +static INLINE __m128i butterfly_cospi16(const __m128i in) { + const __m128i cst = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i lo = _mm_unpacklo_epi16(in, _mm_setzero_si128()); + const __m128i hi = _mm_unpackhi_epi16(in, _mm_setzero_si128()); + return idct_calc_wraplow_sse2(lo, hi, cst); } -// Function to allow 8 bit optimisations to be used when profile 0 is used with +// Functions to allow 8 bit optimisations to be used when profile 0 is used with // highbitdepth enabled -static INLINE __m128i load_input_data(const tran_low_t *data) { +static INLINE __m128i load_input_data4(const tran_low_t *data) { #if CONFIG_VP9_HIGHBITDEPTH - return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); + const __m128i zero = _mm_setzero_si128(); + const __m128i in = _mm_load_si128((const __m128i *)data); + return _mm_packs_epi32(in, zero); #else - return _mm_load_si128((const __m128i *)data); + return _mm_loadl_epi64((const __m128i *)data); #endif } -static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { - in[0] = load_input_data(input + 0 * 16); - in[1] = load_input_data(input + 1 * 16); - in[2] = load_input_data(input + 2 * 16); - in[3] = load_input_data(input + 3 * 16); - in[4] = load_input_data(input + 4 * 16); - in[5] = load_input_data(input + 5 * 16); - in[6] = load_input_data(input + 6 * 16); - in[7] = load_input_data(input + 7 * 16); - - in[8] = load_input_data(input + 8 * 16); - in[9] = load_input_data(input + 9 * 16); - in[10] = load_input_data(input + 10 * 16); - in[11] = load_input_data(input + 11 * 16); - in[12] = load_input_data(input + 12 * 16); - in[13] = load_input_data(input + 13 * 16); - in[14] = load_input_data(input + 14 * 16); - in[15] = load_input_data(input + 15 * 16); +static INLINE __m128i load_input_data8(const tran_low_t *data) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i in0 = _mm_load_si128((const __m128i *)data); + const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4)); + return _mm_packs_epi32(in0, in1); +#else + return _mm_load_si128((const __m128i *)data); +#endif } -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ - } +static INLINE void load_transpose_16bit_8x8(const tran_low_t *input, + const int stride, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * stride); + in[1] = load_input_data8(input + 1 * stride); + in[2] = load_input_data8(input + 2 * stride); + in[3] = load_input_data8(input + 3 * stride); + in[4] = 
load_input_data8(input + 4 * stride); + in[5] = load_input_data8(input + 5 * stride); + in[6] = load_input_data8(input + 6 * stride); + in[7] = load_input_data8(input + 7 * stride); + transpose_16bit_8x8(in, in); +} -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); +static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) { const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); - - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); - RECON_AND_STORE(dest + 10 * stride, in[10]); - RECON_AND_STORE(dest + 11 * stride, in[11]); - RECON_AND_STORE(dest + 12 * stride, in[12]); - RECON_AND_STORE(dest + 13 * stride, in[13]); - RECON_AND_STORE(dest + 14 * stride, in[14]); - RECON_AND_STORE(dest + 15 * stride, in[15]); + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); + d0 = _mm_unpacklo_epi8(d0, zero); + d0 = _mm_add_epi16(in_x, d0); + d0 = _mm_packus_epi16(d0, d0); + _mm_storel_epi64((__m128i *)(dest), d0); } -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, 
tr1_6); \ - } +static INLINE void round_shift_8x8(const __m128i *const in, + __m128i *const out) { + const __m128i final_rounding = _mm_set1_epi16(1 << 4); -#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - } + out[0] = _mm_add_epi16(in[0], final_rounding); + out[1] = _mm_add_epi16(in[1], final_rounding); + out[2] = _mm_add_epi16(in[2], final_rounding); + out[3] = _mm_add_epi16(in[3], final_rounding); + out[4] = _mm_add_epi16(in[4], final_rounding); + out[5] = _mm_add_epi16(in[5], final_rounding); + out[6] = _mm_add_epi16(in[6], final_rounding); + out[7] = _mm_add_epi16(in[7], final_rounding); -// Define Macro for multiplying elements by constants and adding them together. -#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \ - res0, res1, res2, res3) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - tmp4 = _mm_madd_epi16(lo_1, cst2); \ - tmp5 = _mm_madd_epi16(hi_1, cst2); \ - tmp6 = _mm_madd_epi16(lo_1, cst3); \ - tmp7 = _mm_madd_epi16(hi_1, cst3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - res2 = _mm_packs_epi32(tmp4, tmp5); \ - res3 = _mm_packs_epi32(tmp6, tmp7); \ - } + out[0] = _mm_srai_epi16(out[0], 5); + out[1] = _mm_srai_epi16(out[1], 5); + out[2] = _mm_srai_epi16(out[2], 5); + out[3] = _mm_srai_epi16(out[3], 5); + out[4] = _mm_srai_epi16(out[4], 5); + out[5] = _mm_srai_epi16(out[5], 5); + out[6] = _mm_srai_epi16(out[6], 5); + out[7] = _mm_srai_epi16(out[7], 5); +} + +static INLINE void write_buffer_8x8(const __m128i *const in, + uint8_t *const dest, const int stride) { + __m128i t[8]; + + round_shift_8x8(in, t); + + recon_and_store(dest + 0 * stride, t[0]); + recon_and_store(dest + 1 * stride, t[1]); + recon_and_store(dest + 2 * stride, t[2]); + recon_and_store(dest + 3 * stride, t[3]); + recon_and_store(dest + 4 * stride, t[4]); + recon_and_store(dest + 5 * stride, t[5]); + recon_and_store(dest + 6 * stride, t[6]); + recon_and_store(dest + 7 * stride, t[7]); +} static INLINE void recon_and_store4x4_sse2(const __m128i *const in, uint8_t *const dest, @@ -307,11 +208,502 @@ static INLINE void recon_and_store4x4_sse2(const __m128i *const in, *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); } -void idct4_sse2(__m128i *in); -void idct8_sse2(__m128i *in); -void idct16_sse2(__m128i *in0, __m128i *in1); -void iadst4_sse2(__m128i *in); -void iadst8_sse2(__m128i *in); -void iadst16_sse2(__m128i *in0, __m128i *in1); +static INLINE void store_buffer_8x32(__m128i 
*in, uint8_t *dst, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + int j = 0; + while (j < 32) { + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm_srai_epi16(in[j], 6); + in[j + 1] = _mm_srai_epi16(in[j + 1], 6); + + recon_and_store(dst, in[j]); + dst += stride; + recon_and_store(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i out; + out = _mm_adds_epi16(in, final_rounding); + out = _mm_srai_epi16(out, 6); + recon_and_store(dest, out); +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +static INLINE void idct8(const __m128i *const in /*in[8]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 1 + butterfly(in[1], in[7], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[5], in[3], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 2 + butterfly(in[0], in[4], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[2], in[6], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + + // stage 4 + out[0] = _mm_add_epi16(step1[0], step2[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step2[4]); + out[4] = _mm_sub_epi16(step1[3], step2[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_add_kernel_sse2(__m128i *const io /*io[8]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + __m128i step1[8], step2[8], tmp[4]; + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + { + const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i lo_1 = _mm_unpackhi_epi16(io[0], zero); + const __m128i lo_3 = _mm_unpackhi_epi16(io[1], zero); + step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7 + step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6 + } + + // stage 2 + { + const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i lo_0 = _mm_unpacklo_epi16(io[0], zero); 
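// (Throughout this 12-coefficient path a single __m128i carries two
// logical values in its low and high 64-bit halves -- step1[4] above
// is "step1 4&7", step2[0] below is "step2 0&1" -- so the 4x8 first
// pass does half the multiplies of the full kernel until
// idct8x8_12_transpose_16bit_4x8() rebuilds complete vectors.)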
+ const __m128i lo_2 = _mm_unpacklo_epi16(io[1], zero); + const __m128i t = idct_madd_round_shift_sse2(cp_16_16, lo_0); + step2[0] = _mm_packs_epi32(t, t); // step2 0&1 + step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6 + } + + // stage 3 + { + const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]); + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6 + } + + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 + + idct8x8_12_transpose_16bit_4x8(tmp, io); + io[4] = io[5] = io[6] = io[7] = zero; + + idct8(io, io); +} + +static INLINE void idct16_8col(const __m128i *const in /*in[16]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + butterfly(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[10], step2[11]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[14], step2[15]); + + // stage 4 + butterfly(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step1[4] = _mm_add_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step1[7] = _mm_add_epi16(step1[6], step1[7]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // 
stage 6 + step2[0] = _mm_add_epi16(step1[0], step1[7]); + step2[1] = _mm_add_epi16(step1[1], step1[6]); + step2[2] = _mm_add_epi16(step1[2], step1[5]); + step2[3] = _mm_add_epi16(step1[3], step1[4]); + step2[4] = _mm_sub_epi16(step1[3], step1[4]); + step2[5] = _mm_sub_epi16(step1[2], step1[5]); + step2[6] = _mm_sub_epi16(step1[1], step1[6]); + step2[7] = _mm_sub_epi16(step1[0], step1[7]); + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + out[0] = _mm_add_epi16(step2[0], step1[15]); + out[1] = _mm_add_epi16(step2[1], step1[14]); + out[2] = _mm_add_epi16(step2[2], step2[13]); + out[3] = _mm_add_epi16(step2[3], step2[12]); + out[4] = _mm_add_epi16(step2[4], step2[11]); + out[5] = _mm_add_epi16(step2[5], step2[10]); + out[6] = _mm_add_epi16(step2[6], step1[9]); + out[7] = _mm_add_epi16(step2[7], step1[8]); + out[8] = _mm_sub_epi16(step2[7], step1[8]); + out[9] = _mm_sub_epi16(step2[6], step1[9]); + out[10] = _mm_sub_epi16(step2[5], step2[10]); + out[11] = _mm_sub_epi16(step2[4], step2[11]); + out[12] = _mm_sub_epi16(step2[3], step2[12]); + out[13] = _mm_sub_epi16(step2[2], step2[13]); + out[14] = _mm_sub_epi16(step2[1], step1[14]); + out[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct16x16_10_pass1(const __m128i *const input /*input[4]*/, + __m128i *const output /*output[16]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i step1[16], step2[16]; + + transpose_16bit_4x4(input, output); + + // stage 2 + { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i lo_1_15 = _mm_unpackhi_epi16(output[0], zero); + const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, output[1]); + step2[8] = idct_calc_wraplow_sse2(k__cospi_p30_m02, k__cospi_p02_p30, + lo_1_15); // step2 8&15 + step2[11] = idct_calc_wraplow_sse2(k__cospi_p06_m26, k__cospi_p26_p06, + lo_13_3); // step2 11&12 + } + + // stage 3 + { + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i lo_2_14 = _mm_unpacklo_epi16(output[1], zero); + step1[4] = idct_calc_wraplow_sse2(k__cospi_p28_m04, k__cospi_p04_p28, + lo_2_14); // step1 4&7 + step1[13] = _mm_unpackhi_epi64(step2[11], zero); + step1[14] = _mm_unpackhi_epi64(step2[8], zero); + } + + // stage 4 + { + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i lo_0_8 = _mm_unpacklo_epi16(output[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(step2[8], step1[14]); + const __m128i lo_10_13 = _mm_unpacklo_epi16(step2[11], step1[13]); + const __m128i t = idct_madd_round_shift_sse2(lo_0_8, k__cospi_p16_p16); + step1[0] = _mm_packs_epi32(t, t); // step2 0&1 + step2[9] = idct_calc_wraplow_sse2(k__cospi_m08_p24, k__cospi_p24_p08, + lo_9_14); // step2 9&14 + step2[10] = idct_calc_wraplow_sse2(k__cospi_m24_m08, 
k__cospi_m08_p24, + lo_10_13); // step2 10&13 + step2[6] = _mm_unpackhi_epi64(step1[4], zero); + } + + // stage 5 + { + const __m128i lo_5_6 = _mm_unpacklo_epi16(step1[4], step2[6]); + step1[6] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_m16_p16, + lo_5_6); // step1 6&5 + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_unpackhi_epi64(step1[11], zero); + step1[13] = _mm_unpackhi_epi64(step1[10], zero); + step1[14] = _mm_unpackhi_epi64(step1[9], zero); + step1[15] = _mm_unpackhi_epi64(step1[8], zero); + } + + // stage 6 + { + const __m128i lo_10_13 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i lo_11_12 = _mm_unpacklo_epi16(step1[11], step1[12]); + step2[10] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, + lo_10_13); // step2 10&13 + step2[11] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, + lo_11_12); // step2 11&12 + step2[13] = _mm_unpackhi_epi64(step2[10], zero); + step2[12] = _mm_unpackhi_epi64(step2[11], zero); + step2[3] = _mm_add_epi16(step1[0], step1[4]); + step2[1] = _mm_add_epi16(step1[0], step1[6]); + step2[6] = _mm_sub_epi16(step1[0], step1[6]); + step2[4] = _mm_sub_epi16(step1[0], step1[4]); + step2[0] = _mm_unpackhi_epi64(step2[3], zero); + step2[2] = _mm_unpackhi_epi64(step2[1], zero); + step2[5] = _mm_unpackhi_epi64(step2[6], zero); + step2[7] = _mm_unpackhi_epi64(step2[4], zero); + } + + // stage 7. Left 8x16 only. + output[0] = _mm_add_epi16(step2[0], step1[15]); + output[1] = _mm_add_epi16(step2[1], step1[14]); + output[2] = _mm_add_epi16(step2[2], step2[13]); + output[3] = _mm_add_epi16(step2[3], step2[12]); + output[4] = _mm_add_epi16(step2[4], step2[11]); + output[5] = _mm_add_epi16(step2[5], step2[10]); + output[6] = _mm_add_epi16(step2[6], step1[9]); + output[7] = _mm_add_epi16(step2[7], step1[8]); + output[8] = _mm_sub_epi16(step2[7], step1[8]); + output[9] = _mm_sub_epi16(step2[6], step1[9]); + output[10] = _mm_sub_epi16(step2[5], step2[10]); + output[11] = _mm_sub_epi16(step2[4], step2[11]); + output[12] = _mm_sub_epi16(step2[3], step2[12]); + output[13] = _mm_sub_epi16(step2[2], step2[13]); + output[14] = _mm_sub_epi16(step2[1], step1[14]); + output[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct16x16_10_pass2(__m128i *const l /*l[8]*/, + __m128i *const io /*io[16]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[16], step2[16]; + + transpose_16bit_4x8(l, io); + + // stage 2 + butterfly(io[1], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(zero, io[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly(io[2], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + + // stage 4 + step1[0] = butterfly_cospi16(io[0]); + butterfly(step2[15], step2[8], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step2[11], step2[12], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + + // stage 5 + butterfly(step1[7], step1[4], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], 
step2[12]); + + // stage 6 + step2[0] = _mm_add_epi16(step1[0], step1[7]); + step2[1] = _mm_add_epi16(step1[0], step1[6]); + step2[2] = _mm_add_epi16(step1[0], step1[5]); + step2[3] = _mm_add_epi16(step1[0], step1[4]); + step2[4] = _mm_sub_epi16(step1[0], step1[4]); + step2[5] = _mm_sub_epi16(step1[0], step1[5]); + step2[6] = _mm_sub_epi16(step1[0], step1[6]); + step2[7] = _mm_sub_epi16(step1[0], step1[7]); + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + io[0] = _mm_add_epi16(step2[0], step1[15]); + io[1] = _mm_add_epi16(step2[1], step1[14]); + io[2] = _mm_add_epi16(step2[2], step2[13]); + io[3] = _mm_add_epi16(step2[3], step2[12]); + io[4] = _mm_add_epi16(step2[4], step2[11]); + io[5] = _mm_add_epi16(step2[5], step2[10]); + io[6] = _mm_add_epi16(step2[6], step1[9]); + io[7] = _mm_add_epi16(step2[7], step1[8]); + io[8] = _mm_sub_epi16(step2[7], step1[8]); + io[9] = _mm_sub_epi16(step2[6], step1[9]); + io[10] = _mm_sub_epi16(step2[5], step2[10]); + io[11] = _mm_sub_epi16(step2[4], step2[11]); + io[12] = _mm_sub_epi16(step2[3], step2[12]); + io[13] = _mm_sub_epi16(step2[2], step2[13]); + io[14] = _mm_sub_epi16(step2[1], step1[14]); + io[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct32_8x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], + &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], &out[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi16(step1[16], step1[19]); + step2[17] = _mm_add_epi16(step1[17], step1[18]); + step2[18] = _mm_sub_epi16(step1[17], step1[18]); + step2[19] = _mm_sub_epi16(step1[16], step1[19]); + step2[20] = _mm_sub_epi16(step1[23], step1[20]); + step2[21] = _mm_sub_epi16(step1[22], step1[21]); + step2[22] = _mm_add_epi16(step1[22], step1[21]); + step2[23] = _mm_add_epi16(step1[23], step1[20]); + + step2[24] = _mm_add_epi16(step1[24], step1[27]); + step2[25] = _mm_add_epi16(step1[25], step1[26]); + step2[26] = _mm_sub_epi16(step1[25], step1[26]); + step2[27] = _mm_sub_epi16(step1[24], step1[27]); + step2[28] = _mm_sub_epi16(step1[31], step1[28]); + step2[29] = _mm_sub_epi16(step1[30], step1[29]); + step2[30] = _mm_add_epi16(step1[29], step1[30]); + step2[31] = _mm_add_epi16(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + butterfly(step2[29], step2[18], 
cospi_24_64, cospi_8_64, &step1[18], + &step1[29]); + butterfly(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], + &step1[28]); + butterfly(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], + &step1[27]); + butterfly(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], + &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + out[16] = _mm_add_epi16(step1[16], step1[23]); + out[17] = _mm_add_epi16(step1[17], step1[22]); + out[18] = _mm_add_epi16(step1[18], step1[21]); + out[19] = _mm_add_epi16(step1[19], step1[20]); + step2[20] = _mm_sub_epi16(step1[19], step1[20]); + step2[21] = _mm_sub_epi16(step1[18], step1[21]); + step2[22] = _mm_sub_epi16(step1[17], step1[22]); + step2[23] = _mm_sub_epi16(step1[16], step1[23]); + + step2[24] = _mm_sub_epi16(step1[31], step1[24]); + step2[25] = _mm_sub_epi16(step1[30], step1[25]); + step2[26] = _mm_sub_epi16(step1[29], step1[26]); + step2[27] = _mm_sub_epi16(step1[28], step1[27]); + out[28] = _mm_add_epi16(step1[27], step1[28]); + out[29] = _mm_add_epi16(step1[26], step1[29]); + out[30] = _mm_add_epi16(step1[25], step1[30]); + out[31] = _mm_add_epi16(step1[24], step1[31]); + + // stage 7 + butterfly(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], &out[27]); + butterfly(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], &out[26]); + butterfly(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], &out[25]); + butterfly(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], &out[24]); +} + +void idct4_sse2(__m128i *const in); +void idct8_sse2(__m128i *const in); +void idct16_sse2(__m128i *const in0, __m128i *const in1); +void iadst4_sse2(__m128i *const in); +void iadst8_sse2(__m128i *const in); +void iadst16_sse2(__m128i *const in0, __m128i *const in1); +void idct32_1024_8x32(const __m128i *const in, __m128i *const out); +void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out); +void idct32_34_8x32_ssse3(const __m128i *const in, __m128i *const out); #endif // VPX_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c b/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c index 4d2d95787..6e99469b6 100644 --- a/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c +++ b/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c @@ -12,1322 +12,353 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - 
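// (The idct32_1024_8x32 / idct32_34_8x32_* prototypes added above let
// the SSE2 and SSSE3 32x32 wrappers share one 1-D transform per
// 8-sample-wide column tile; the 135-coefficient wrapper reuses
// idct32_1024_8x32 simply by zeroing input rows 16..31 first, as seen
// earlier in this diff.)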
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. - in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); - - // 2-D - for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - { - /* Stage1 */ - { - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); - - { - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp1 = _mm_madd_epi16(hi_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp3 = _mm_madd_epi16(hi_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp5 = _mm_madd_epi16(hi_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - tmp7 = _mm_madd_epi16(hi_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, 14); - tmp1 = _mm_srai_epi32(tmp1, 14); - tmp2 = _mm_srai_epi32(tmp2, 14); - tmp3 = _mm_srai_epi32(tmp3, 14); - tmp4 = _mm_srai_epi32(tmp4, 14); - tmp5 = _mm_srai_epi32(tmp5, 14); - tmp6 = _mm_srai_epi32(tmp6, 14); - tmp7 = _mm_srai_epi32(tmp7, 14); - - stp1_4 = _mm_packs_epi32(tmp0, tmp1); - stp1_7 = _mm_packs_epi32(tmp2, tmp3); - stp1_5 = _mm_packs_epi32(tmp4, tmp5); - stp1_6 = _mm_packs_epi32(tmp6, tmp7); - } - } - - /* Stage2 */ - { - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); - - { - tmp0 = _mm_unpacklo_epi16(in0, in4); - tmp1 = _mm_unpackhi_epi16(in0, in4); - - tmp2 = _mm_madd_epi16(tmp0, stk2_0); - tmp3 = _mm_madd_epi16(tmp1, stk2_0); - tmp4 = _mm_madd_epi16(tmp0, stk2_1); - tmp5 = _mm_madd_epi16(tmp1, stk2_1); - - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp2, tmp3); - stp2_1 = _mm_packs_epi32(tmp4, tmp5); - - tmp0 = _mm_madd_epi16(lo_26, stg2_2); - tmp1 = _mm_madd_epi16(hi_26, stg2_2); - tmp2 = _mm_madd_epi16(lo_26, stg2_3); - tmp3 = _mm_madd_epi16(hi_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - - tmp0 = _mm_srai_epi32(tmp0, 14); - tmp1 = _mm_srai_epi32(tmp1, 14); - tmp2 = _mm_srai_epi32(tmp2, 14); - tmp3 = _mm_srai_epi32(tmp3, 14); - - stp2_2 = _mm_packs_epi32(tmp0, tmp1); - stp2_3 = _mm_packs_epi32(tmp2, tmp3); - } - - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); - } - - /* Stage3 */ - { - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - 
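// The Stage3/Stage4 tail of this removed kernel is the generic even/odd
// recombination out[i] = a[i] + a[n-1-i], out[n-1-i] = a[i] - a[n-1-i],
// here at size 8; the rewrite reuses the shared add_sub_butterfly() helper
// (removed from this file further down, size 16/32) for the larger
// transforms. A scalar sketch of that recombination (add_sub_model is a
// hypothetical name; like _mm_add_epi16, the int16_t casts wrap on overflow):
static INLINE void add_sub_model(const int16_t *in, int16_t *out, int size) {
  int i;
  for (i = 0; i < size / 2; ++i) {
    out[i] = (int16_t)(in[i] + in[size - 1 - i]);
    out[size - 1 - i] = (int16_t)(in[i] - in[size - 1 - i]);
  }
}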
stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - - tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); - tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); - - tmp2 = _mm_madd_epi16(tmp0, stk2_1); - tmp3 = _mm_madd_epi16(tmp1, stk2_1); - tmp4 = _mm_madd_epi16(tmp0, stk2_0); - tmp5 = _mm_madd_epi16(tmp1, stk2_0); - - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp2, tmp3); - stp1_6 = _mm_packs_epi32(tmp4, tmp5); - } - - /* Stage4 */ - in0 = _mm_add_epi16(stp1_0, stp2_7); - in1 = _mm_add_epi16(stp1_1, stp1_6); - in2 = _mm_add_epi16(stp1_2, stp1_5); - in3 = _mm_add_epi16(stp1_3, stp2_4); - in4 = _mm_sub_epi16(stp1_3, stp2_4); - in5 = _mm_sub_epi16(stp1_2, stp1_5); - in6 = _mm_sub_epi16(stp1_1, stp1_6); - in7 = _mm_sub_epi16(stp1_0, stp2_7); - } - } - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); +static INLINE void partial_butterfly_ssse3(const __m128i in, const int c0, + const int c1, __m128i *const out0, + __m128i *const out1) { + const __m128i cst0 = _mm_set1_epi16(2 * c0); + const __m128i cst1 = _mm_set1_epi16(2 * c1); + *out0 = _mm_mulhrs_epi16(in, cst0); + *out1 = _mm_mulhrs_epi16(in, cst1); +} - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); +static INLINE __m128i partial_butterfly_cospi16_ssse3(const __m128i in) { + const __m128i coef_pair = _mm_set1_epi16(2 * cospi_16_64); + return _mm_mulhrs_epi16(in, coef_pair); } void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); - const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); - const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); - const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); - const 
__m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3; - - // Rows. Load 4-row input data. - in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - - // Stage1 - tmp0 = _mm_mulhrs_epi16(in0, stg1_0); - tmp1 = _mm_mulhrs_epi16(in0, stg1_1); - tmp2 = _mm_mulhrs_epi16(in1, stg1_2); - tmp3 = _mm_mulhrs_epi16(in1, stg1_3); - - stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1); - stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3); - - // Stage2 - tmp0 = _mm_mulhrs_epi16(in0, stg2_0); - stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0); - - tmp1 = _mm_mulhrs_epi16(in1, stg2_2); - tmp2 = _mm_mulhrs_epi16(in1, stg2_3); - stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1); - - tmp0 = _mm_add_epi16(stp1_4, stp1_5); - tmp1 = _mm_sub_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); - - tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6); - tmp1 = _mm_madd_epi16(tmp0, stg3_0); - tmp2 = _mm_madd_epi16(tmp0, stk2_0); // stg3_1 = stk2_0 - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp1, tmp2); - - // Stage3 - tmp2 = _mm_add_epi16(stp2_0, stp2_2); - tmp3 = _mm_sub_epi16(stp2_0, stp2_2); + __m128i io[8]; - stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2); - stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2); - - // Stage4 - tmp0 = _mm_add_epi16(stp1_3, stp2_4); - tmp1 = _mm_add_epi16(stp1_2, stp1_5); - tmp2 = _mm_sub_epi16(stp1_3, stp2_4); - tmp3 = _mm_sub_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - /* Stage1 */ - stp1_4 = _mm_mulhrs_epi16(in1, stg1_0); - stp1_7 = _mm_mulhrs_epi16(in1, stg1_1); - stp1_5 = _mm_mulhrs_epi16(in3, stg1_2); - stp1_6 = _mm_mulhrs_epi16(in3, stg1_3); - - /* Stage2 */ - stp2_0 = _mm_mulhrs_epi16(in0, stg2_0); - stp2_1 = _mm_mulhrs_epi16(in0, stg2_0); - - stp2_2 = _mm_mulhrs_epi16(in2, stg2_2); - stp2_3 = _mm_mulhrs_epi16(in2, stg2_3); - - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); - - /* Stage3 */ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - - tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); - tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); - - tmp2 = _mm_madd_epi16(tmp0, stk2_0); - tmp3 = _mm_madd_epi16(tmp1, stk2_0); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - stp1_6 = _mm_packs_epi32(tmp2, tmp3); - - tmp2 = _mm_madd_epi16(tmp0, stk2_1); - tmp3 = _mm_madd_epi16(tmp1, stk2_1); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - stp1_5 = _mm_packs_epi32(tmp2, tmp3); - - /* Stage4 */ - in0 = _mm_add_epi16(stp1_0, stp2_7); - in1 = _mm_add_epi16(stp1_1, stp1_6); - in2 = _mm_add_epi16(stp1_2, 
stp1_5); - in3 = _mm_add_epi16(stp1_3, stp2_4); - in4 = _mm_sub_epi16(stp1_3, stp2_4); - in5 = _mm_sub_epi16(stp1_2, stp1_5); - in6 = _mm_sub_epi16(stp1_1, stp1_6); - in7 = _mm_sub_epi16(stp1_0, stp2_7); - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} + io[0] = load_input_data4(input + 0 * 8); + io[1] = load_input_data4(input + 1 * 8); + io[2] = load_input_data4(input + 2 * 8); + io[3] = load_input_data4(input + 3 * 8); -// Only do addition and subtraction butterfly, size = 16, 32 -static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, - int size) { - int i = 0; - const int num = size >> 1; - const int bound = size - 1; - while (i < num) { - out[i] = _mm_add_epi16(in[i], in[bound - i]); - out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); - i++; - } + idct8x8_12_add_kernel_ssse3(io); + write_buffer_8x8(io, dest, stride); } -#define BUTTERFLY_PAIR(x0, x1, co0, co1) \ - do { \ - tmp0 = _mm_madd_epi16(x0, co0); \ - tmp1 = _mm_madd_epi16(x1, co0); \ - tmp2 = _mm_madd_epi16(x0, co1); \ - tmp3 = _mm_madd_epi16(x1, co1); \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - } while (0) - -static INLINE void butterfly(const __m128i *x0, const __m128i *x1, - const __m128i *c0, const __m128i *c1, __m128i *y0, - __m128i *y1) { - __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 - u0 = _mm_unpacklo_epi16(*x0, *x1); - u1 = _mm_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, *c0, *c1); - *y0 = _mm_packs_epi32(tmp0, tmp1); - *y1 = _mm_packs_epi32(tmp2, tmp3); +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + + // stage 4 + step2[0] = partial_butterfly_cospi16_ssse3(in[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + 
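// partial_butterfly_ssse3() above is the core trick of this rewrite:
// _mm_mulhrs_epi16(a, b) returns round(a * b / 2^15) per lane, so loading the
// constant pre-doubled as 2 * cospi_x_64 yields round(a * cospi_x_64 / 2^14),
// exactly the madd/round/shift result of the SSE2 path, in one instruction
// and with no widening to 32 bits. Scalar model of the intrinsic
// (mulhrs_model is a hypothetical name):
static INLINE int16_t mulhrs_model(int16_t a, int16_t b) {
  // PMULHRSW: take bits [29:14] of the 32-bit product, round, halve.
  return (int16_t)(((((int32_t)a * b) >> 14) + 1) >> 1);
}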
step1[1] = step2[0]; + step1[2] = step2[0]; + step1[3] = step2[0]; + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); } -static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0, - const __m128i *c1) { - __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - - u0 = _mm_unpacklo_epi16(*x0, *x1); - u1 = _mm_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, *c0, *c1); - *x0 = _mm_packs_epi32(tmp0, tmp1); - *x1 = _mm_packs_epi32(tmp2, tmp3); +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); } -static void idct32_34_first_half(const __m128i *in, __m128i *stp1) { - const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - - const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i x0, x1, x4, x5, x6, x7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - - // phase 1 - - // 0, 15 - u2 = _mm_mulhrs_epi16(in[2], stk2_1); // stp2_15 - u3 = _mm_mulhrs_epi16(in[6], stk2_7); // stp2_12 - v15 = _mm_add_epi16(u2, u3); - // in[0], in[4] - x0 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[0] - x7 = _mm_mulhrs_epi16(in[4], stk3_1); // stp1[7] - v0 = _mm_add_epi16(x0, x7); // stp2_0 - stp1[0] = _mm_add_epi16(v0, v15); - stp1[15] = _mm_sub_epi16(v0, v15); - - // in[2], in[6] - u0 = _mm_mulhrs_epi16(in[2], stk2_0); // stp2_8 - u1 = _mm_mulhrs_epi16(in[6], stk2_6); // stp2_11 - butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5); // stp2_9, stp2_14 - butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7); // stp2_10, stp2_13 - - v8 = _mm_add_epi16(u0, u1); - v9 = _mm_add_epi16(u4, 
u6); - v10 = _mm_sub_epi16(u4, u6); - v11 = _mm_sub_epi16(u0, u1); - v12 = _mm_sub_epi16(u2, u3); - v13 = _mm_sub_epi16(u5, u7); - v14 = _mm_add_epi16(u5, u7); - - butterfly_self(&v10, &v13, &stg6_0, &stg4_0); - butterfly_self(&v11, &v12, &stg6_0, &stg4_0); - - // 1, 14 - x1 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0 - // stp1[2] = stp1[0], stp1[3] = stp1[1] - x4 = _mm_mulhrs_epi16(in[4], stk3_0); // stp1[4] - butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6); - v1 = _mm_add_epi16(x1, x6); // stp2_1 - v2 = _mm_add_epi16(x0, x5); // stp2_2 - stp1[1] = _mm_add_epi16(v1, v14); - stp1[14] = _mm_sub_epi16(v1, v14); - - stp1[2] = _mm_add_epi16(v2, v13); - stp1[13] = _mm_sub_epi16(v2, v13); - - v3 = _mm_add_epi16(x1, x4); // stp2_3 - v4 = _mm_sub_epi16(x1, x4); // stp2_4 - - v5 = _mm_sub_epi16(x0, x5); // stp2_5 - - v6 = _mm_sub_epi16(x1, x6); // stp2_6 - v7 = _mm_sub_epi16(x0, x7); // stp2_7 - stp1[3] = _mm_add_epi16(v3, v12); - stp1[12] = _mm_sub_epi16(v3, v12); - - stp1[6] = _mm_add_epi16(v6, v9); - stp1[9] = _mm_sub_epi16(v6, v9); - - stp1[7] = _mm_add_epi16(v7, v8); - stp1[8] = _mm_sub_epi16(v7, v8); - - stp1[4] = _mm_add_epi16(v4, v11); - stp1[11] = _mm_sub_epi16(v4, v11); - - stp1[5] = _mm_add_epi16(v5, v10); - stp1[10] = _mm_sub_epi16(v5, v10); +static INLINE void idct32_34_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_34_8x32_quarter_1(in, temp); + idct32_34_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); } -static void idct32_34_second_half(const __m128i *in, __m128i *stp1) { - const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); - const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); - const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); - const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); - const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); - const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); - const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); - const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i v16, v17, v18, v19, v20, v21, v22, v23; - __m128i v24, v25, v26, v27, v28, v29, v30, v31; - __m128i u16, u17, u18, u19, u20, u21, u22, u23; - __m128i u24, u25, u26, u27, u28, u29, u30, u31; - - v16 = _mm_mulhrs_epi16(in[1], stk1_0); - v31 = _mm_mulhrs_epi16(in[1], stk1_1); - - v19 = _mm_mulhrs_epi16(in[7], stk1_6); - v28 = _mm_mulhrs_epi16(in[7], stk1_7); - - v20 = _mm_mulhrs_epi16(in[5], stk1_8); - v27 = _mm_mulhrs_epi16(in[5], stk1_9); - - v23 = _mm_mulhrs_epi16(in[3], stk1_14); - v24 = _mm_mulhrs_epi16(in[3], stk1_15); - - butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, 
&v30); - butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29); - butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26); - butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25); - - u16 = _mm_add_epi16(v16, v19); - u17 = _mm_add_epi16(v17, v18); - u18 = _mm_sub_epi16(v17, v18); - u19 = _mm_sub_epi16(v16, v19); - u20 = _mm_sub_epi16(v23, v20); - u21 = _mm_sub_epi16(v22, v21); - u22 = _mm_add_epi16(v22, v21); - u23 = _mm_add_epi16(v23, v20); - u24 = _mm_add_epi16(v24, v27); - u27 = _mm_sub_epi16(v24, v27); - u25 = _mm_add_epi16(v25, v26); - u26 = _mm_sub_epi16(v25, v26); - u28 = _mm_sub_epi16(v31, v28); - u31 = _mm_add_epi16(v28, v31); - u29 = _mm_sub_epi16(v30, v29); - u30 = _mm_add_epi16(v29, v30); - - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - - stp1[16] = _mm_add_epi16(u16, u23); - stp1[23] = _mm_sub_epi16(u16, u23); - - stp1[17] = _mm_add_epi16(u17, u22); - stp1[22] = _mm_sub_epi16(u17, u22); - - stp1[18] = _mm_add_epi16(u18, u21); - stp1[21] = _mm_sub_epi16(u18, u21); - - stp1[19] = _mm_add_epi16(u19, u20); - stp1[20] = _mm_sub_epi16(u19, u20); - - stp1[24] = _mm_sub_epi16(u31, u24); - stp1[31] = _mm_add_epi16(u24, u31); - - stp1[25] = _mm_sub_epi16(u30, u25); - stp1[30] = _mm_add_epi16(u25, u30); - - stp1[26] = _mm_sub_epi16(u29, u26); - stp1[29] = _mm_add_epi16(u26, u29); +// For each 8x32 block __m128i in[32], +// Input with odd index, 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32]; + + // stage 1 + partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 3 + butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} - stp1[27] = _mm_sub_epi16(u28, u27); - stp1[28] = _mm_add_epi16(u27, u28); +void idct32_34_8x32_ssse3(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; - butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0); - butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0); - butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0); - butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0); + idct32_34_8x32_quarter_1_2(in, temp); + idct32_34_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); } // Only upper-left 8x8 has non-zero coeff void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - __m128i in[32], col[32]; - __m128i stp1[32]; + __m128i io[32], col[32]; int i; // Load input data. Only need to load the top left 8x8 block. 
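// The _34 suffix means the scan-ordered eob is at most 34, which confines the
// non-zero coefficients to the top-left 8x8 corner of the 32x32 block; all
// other inputs are treated as zero, so a single strided 8x8 load suffices.
// Scalar model of that load (a sketch; the load_transpose_16bit_8x8() call in
// the replacement code also transposes the tile while loading):
static INLINE void load_top_left_8x8_model(const tran_low_t *input,
                                           int16_t tile[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c) tile[r][c] = (int16_t)input[r * 32 + c];
}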
- in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); + load_transpose_16bit_8x8(input, 32, io); + idct32_34_8x32_ssse3(io, col); - array_transpose_8x8(in, in); - idct32_34_first_half(in, stp1); - idct32_34_second_half(in, stp1); - - // 1_D: Store 32 intermediate results for each 8x32 block. - add_sub_butterfly(stp1, col, 32); - for (i = 0; i < 4; i++) { + for (i = 0; i < 32; i += 8) { int j; - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - idct32_34_first_half(in, stp1); - idct32_34_second_half(in, stp1); + transpose_16bit_8x8(col + i, io); + idct32_34_8x32_ssse3(io, io); - // 2_D: Calculate the results and store them to destination. - add_sub_butterfly(stp1, in, 32); for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + write_buffer_8x1(dest + j * stride, io[j]); } dest += 8; } } -// in0[16] represents the left 8x16 block -// in1[16] represents the right 8x16 block -static void load_buffer_16x16(const tran_low_t *input, __m128i *in0, - __m128i *in1) { - int i; - for (i = 0; i < 16; i++) { - in0[i] = load_input_data(input); - in1[i] = load_input_data(input + 8); - input += 32; - } -} - -static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0, - __m128i *out1) { - array_transpose_8x8(in0, out0); - array_transpose_8x8(&in0[8], out1); - array_transpose_8x8(in1, &out0[8]); - array_transpose_8x8(&in1[8], &out1[8]); -} - -// Group the coefficient calculation into smaller functions -// to prevent stack spillover: -// quarter_1: 0-7 -// quarter_2: 8-15 -// quarter_3_4: 16-23, 24-31 -static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/, - __m128i *out /*out[8]*/) { - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - - { - const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); - const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); - u0 = _mm_mulhrs_epi16(in[0], stk4_0); - u2 = _mm_mulhrs_epi16(in[8], stk4_2); - u3 = _mm_mulhrs_epi16(in[8], stk4_3); - u1 = u0; - } - - v0 = _mm_add_epi16(u0, u3); - v1 = _mm_add_epi16(u1, u2); - v2 = _mm_sub_epi16(u1, u2); - v3 = _mm_sub_epi16(u0, u3); - - { - const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); - const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); - u4 = _mm_mulhrs_epi16(in[4], stk3_0); - u7 = _mm_mulhrs_epi16(in[4], stk3_1); - u5 = _mm_mulhrs_epi16(in[12], stk3_2); - u6 = _mm_mulhrs_epi16(in[12], stk3_3); - } - - v4 = _mm_add_epi16(u4, u5); - v5 = _mm_sub_epi16(u4, u5); - v6 = _mm_sub_epi16(u7, u6); - v7 = _mm_add_epi16(u7, u6); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); - } - - out[0] = _mm_add_epi16(v0, v7); - out[1] = _mm_add_epi16(v1, v6); - out[2] = _mm_add_epi16(v2, v5); - out[3] = _mm_add_epi16(v3, v4); - out[4] = 
_mm_sub_epi16(v3, v4); - out[5] = _mm_sub_epi16(v2, v5); - out[6] = _mm_sub_epi16(v1, v6); - out[7] = _mm_sub_epi16(v0, v7); +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + partial_butterfly_ssse3(in[12], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + + // stage 4 + step2[0] = partial_butterfly_cospi16_ssse3(in[0]); + partial_butterfly_ssse3(in[8], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[0], step2[2]); + step1[2] = _mm_sub_epi16(step2[0], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); } -static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/, - __m128i *out /*out[8]*/) { - __m128i u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v8, v9, v10, v11, v12, v13, v14, v15; - - { - const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64); - const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64); - const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64); - const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64); - const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - u8 = _mm_mulhrs_epi16(in[2], stk2_0); - u15 = _mm_mulhrs_epi16(in[2], stk2_1); - u9 = _mm_mulhrs_epi16(in[14], stk2_2); - u14 = _mm_mulhrs_epi16(in[14], stk2_3); - u10 = _mm_mulhrs_epi16(in[10], stk2_4); - u13 = _mm_mulhrs_epi16(in[10], stk2_5); - u11 = _mm_mulhrs_epi16(in[6], stk2_6); - u12 = _mm_mulhrs_epi16(in[6], stk2_7); - } - - v8 = _mm_add_epi16(u8, u9); - v9 = _mm_sub_epi16(u8, u9); - v10 = _mm_sub_epi16(u11, u10); - v11 = _mm_add_epi16(u11, u10); - v12 = _mm_add_epi16(u12, u13); - v13 = _mm_sub_epi16(u12, u13); - v14 = _mm_sub_epi16(u15, u14); - v15 = _mm_add_epi16(u15, u14); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, &stg4_4, &stg4_5); - butterfly_self(&v10, &v13, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(v8, v11); - out[1] = _mm_add_epi16(v9, v10); - out[2] = _mm_sub_epi16(v9, v10); - out[3] = _mm_sub_epi16(v8, v11); - out[4] = _mm_sub_epi16(v15, v12); - out[5] = _mm_sub_epi16(v14, v13); - out[6] = 
_mm_add_epi16(v14, v13); - out[7] = _mm_add_epi16(v15, v12); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); - butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); - } +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + partial_butterfly_ssse3(in[14], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + partial_butterfly_ssse3(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[11], step2[10]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[15], step2[14]); + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); } -// 8x32 block even indexed 8 inputs of in[16], -// output first half 16 to out[32] -static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/, - __m128i *out /*out[32]*/) { +static INLINE void idct32_135_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { __m128i temp[16]; - idct32_8x32_135_quarter_1(in, temp); - idct32_8x32_135_quarter_2(in, &temp[8]); + idct32_135_8x32_quarter_1(in, temp); + idct32_135_8x32_quarter_2(in, temp); + // stage 7 add_sub_butterfly(temp, out, 16); } -// 8x32 block odd indexed 8 inputs of in[16], -// output second half 16 to out[32] -static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/, - __m128i *out /*out[32]*/) { - __m128i v16, v17, v18, v19, v20, v21, v22, v23; - __m128i v24, v25, v26, v27, v28, v29, v30, v31; - __m128i u16, u17, u18, u19, u20, u21, u22, u23; - __m128i u24, u25, u26, u27, u28, u29, u30, u31; - - { - const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); - const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); - const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64); - const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64); - - const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64); - const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64); - const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); - const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); - const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); - const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); - const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64); - const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64); - - const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64); - const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64); - const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); - const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); - u16 = 
_mm_mulhrs_epi16(in[1], stk1_0); - u31 = _mm_mulhrs_epi16(in[1], stk1_1); - u17 = _mm_mulhrs_epi16(in[15], stk1_2); - u30 = _mm_mulhrs_epi16(in[15], stk1_3); - - u18 = _mm_mulhrs_epi16(in[9], stk1_4); - u29 = _mm_mulhrs_epi16(in[9], stk1_5); - u19 = _mm_mulhrs_epi16(in[7], stk1_6); - u28 = _mm_mulhrs_epi16(in[7], stk1_7); - - u20 = _mm_mulhrs_epi16(in[5], stk1_8); - u27 = _mm_mulhrs_epi16(in[5], stk1_9); - u21 = _mm_mulhrs_epi16(in[11], stk1_10); - u26 = _mm_mulhrs_epi16(in[11], stk1_11); - - u22 = _mm_mulhrs_epi16(in[13], stk1_12); - u25 = _mm_mulhrs_epi16(in[13], stk1_13); - u23 = _mm_mulhrs_epi16(in[3], stk1_14); - u24 = _mm_mulhrs_epi16(in[3], stk1_15); - } - - v16 = _mm_add_epi16(u16, u17); - v17 = _mm_sub_epi16(u16, u17); - v18 = _mm_sub_epi16(u19, u18); - v19 = _mm_add_epi16(u19, u18); - - v20 = _mm_add_epi16(u20, u21); - v21 = _mm_sub_epi16(u20, u21); - v22 = _mm_sub_epi16(u23, u22); - v23 = _mm_add_epi16(u23, u22); - - v24 = _mm_add_epi16(u24, u25); - v25 = _mm_sub_epi16(u24, u25); - v26 = _mm_sub_epi16(u27, u26); - v27 = _mm_add_epi16(u27, u26); - - v28 = _mm_add_epi16(u28, u29); - v29 = _mm_sub_epi16(u28, u29); - v30 = _mm_sub_epi16(u31, u30); - v31 = _mm_add_epi16(u31, u30); - - { - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - butterfly_self(&v17, &v30, &stg3_4, &stg3_5); - butterfly_self(&v18, &v29, &stg3_6, &stg3_4); - butterfly_self(&v21, &v26, &stg3_8, &stg3_9); - butterfly_self(&v22, &v25, &stg3_10, &stg3_8); - } - - u16 = _mm_add_epi16(v16, v19); - u17 = _mm_add_epi16(v17, v18); - u18 = _mm_sub_epi16(v17, v18); - u19 = _mm_sub_epi16(v16, v19); - u20 = _mm_sub_epi16(v23, v20); - u21 = _mm_sub_epi16(v22, v21); - u22 = _mm_add_epi16(v22, v21); - u23 = _mm_add_epi16(v23, v20); - - u24 = _mm_add_epi16(v24, v27); - u25 = _mm_add_epi16(v25, v26); - u26 = _mm_sub_epi16(v25, v26); - u27 = _mm_sub_epi16(v24, v27); - u28 = _mm_sub_epi16(v31, v28); - u29 = _mm_sub_epi16(v30, v29); - u30 = _mm_add_epi16(v29, v30); - u31 = _mm_add_epi16(v28, v31); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(u16, u23); - out[1] = _mm_add_epi16(u17, u22); - out[2] = _mm_add_epi16(u18, u21); - out[3] = _mm_add_epi16(u19, u20); - v20 = _mm_sub_epi16(u19, u20); - v21 = _mm_sub_epi16(u18, u21); - v22 = _mm_sub_epi16(u17, u22); - v23 = _mm_sub_epi16(u16, u23); - - v24 = _mm_sub_epi16(u31, u24); - v25 = _mm_sub_epi16(u30, u25); - v26 = _mm_sub_epi16(u29, u26); - v27 = _mm_sub_epi16(u28, u27); - out[12] = _mm_add_epi16(u27, u28); - out[13] = _mm_add_epi16(u26, u29); - out[14] = _mm_add_epi16(u25, u30); - out[15] = _mm_add_epi16(u24, u31); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]); - butterfly(&v21, &v26, &stg6_0, 
&stg4_0, &out[5], &out[10]); - butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]); - butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]); - } -} - -// 8x16 block, input __m128i in[16], output __m128i in[32] -static void idct32_8x32_135(__m128i *in /*in[32]*/) { - __m128i out[32]; - idct32_8x32_quarter_1_2(in, out); - idct32_8x32_quarter_3_4(in, &out[16]); - add_sub_butterfly(out, in, 32); -} - -static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - int j = 0; - while (j < 32) { - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); - - in[j] = _mm_srai_epi16(in[j], 6); - in[j + 1] = _mm_srai_epi16(in[j + 1], 6); - - RECON_AND_STORE(dst, in[j]); - dst += stride; - RECON_AND_STORE(dst, in[j + 1]); - dst += stride; - j += 2; - } -} - -static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest, - int stride) { - store_buffer_8x32(in0, dest, stride); - store_buffer_8x32(in1, dest + 8, stride); -} - -static INLINE void idct32_135(__m128i *col0, __m128i *col1) { - idct32_8x32_135(col0); - idct32_8x32_135(col1); -} - -typedef enum { left_16, right_16 } ColsIndicator; - -static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store, - ColsIndicator cols) { - switch (cols) { - case left_16: { - int i; - array_transpose_16x16(in0, in1); - for (i = 0; i < 16; ++i) { - store[i] = in0[16 + i]; - store[16 + i] = in1[16 + i]; - } - break; - } - case right_16: { - array_transpose_16x16_2(store, &store[16], in0, in1); - break; - } - default: { assert(0); } - } -} - -// Only upper-left 16x16 has non-zero coeff -void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - // Each array represents an 8x32 block - __m128i col0[32], col1[32]; - // This array represents a 16x16 block - __m128i temp[32]; - - // Load input data. Only need to load the top left 16x16 block. 
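// The _135 variant assumes eob <= 135, i.e. non-zero coefficients only in the
// upper-left 16x16 quadrant of the 32x32 block. After the column pass,
// store_buffer_8x32() applies the final rounding by 1 << 5, shifts right by 6,
// and adds the residual to the predictor with unsigned saturation
// (RECON_AND_STORE). Scalar model of one reconstructed pixel
// (recon_pixel_model is a hypothetical name):
static INLINE uint8_t recon_pixel_model(uint8_t pred, int16_t residual) {
  const int v = pred + ((residual + 32) >> 6);         // round, then >> 6
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));   // saturate to 8 bits
}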
- load_buffer_16x16(input, col0, col1); - - // columns - array_transpose_16x16(col0, col1); - idct32_135(col0, col1); - - // rows - transpose_and_copy_16x16(col0, col1, temp, left_16); - idct32_135(col0, col1); - recon_and_store(col0, col1, dest, stride); - - transpose_and_copy_16x16(col0, col1, temp, right_16); - idct32_135(col0, col1); - recon_and_store(col0, col1, dest + 16, stride); -} - -// For each 8x32 block __m128i in[32], -// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 -// output pixels: 8-15 in __m128i in[32] -static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/, - __m128i *out /*out[16]*/) { - __m128i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_ - __m128i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_ - - { - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15); - butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14); - } - - v8 = _mm_add_epi16(u8, u9); - v9 = _mm_sub_epi16(u8, u9); - v14 = _mm_sub_epi16(u15, u14); - v15 = _mm_add_epi16(u15, u14); - - { - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13); - butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12); - } - - v10 = _mm_sub_epi16(u11, u10); - v11 = _mm_add_epi16(u11, u10); - v12 = _mm_add_epi16(u12, u13); - v13 = _mm_sub_epi16(u12, u13); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, &stg4_4, &stg4_5); - butterfly_self(&v10, &v13, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(v8, v11); - out[1] = _mm_add_epi16(v9, v10); - out[6] = _mm_add_epi16(v14, v13); - out[7] = _mm_add_epi16(v15, v12); - - out[2] = _mm_sub_epi16(v9, v10); - out[3] = _mm_sub_epi16(v8, v11); - out[4] = _mm_sub_epi16(v15, v12); - out[5] = _mm_sub_epi16(v14, v13); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); - butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); - } -} - -// For each 8x32 block __m128i in[32], -// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 -// output pixels: 0-7 in __m128i in[32] -static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/, - __m128i *out /*out[8]*/) { - __m128i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_ - __m128i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_ - - { - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7); - butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6); - } - - v4 = _mm_add_epi16(u4, u5); - v5 = _mm_sub_epi16(u4, u5); - v6 = _mm_sub_epi16(u7, u6); - v7 = _mm_add_epi16(u7, u6); - - { - const __m128i stg4_0 = 
pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); - - butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1); - butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3); - } - - v0 = _mm_add_epi16(u0, u3); - v1 = _mm_add_epi16(u1, u2); - v2 = _mm_sub_epi16(u1, u2); - v3 = _mm_sub_epi16(u0, u3); - - out[0] = _mm_add_epi16(v0, v7); - out[1] = _mm_add_epi16(v1, v6); - out[2] = _mm_add_epi16(v2, v5); - out[3] = _mm_add_epi16(v3, v4); - out[4] = _mm_sub_epi16(v3, v4); - out[5] = _mm_sub_epi16(v2, v5); - out[6] = _mm_sub_epi16(v1, v6); - out[7] = _mm_sub_epi16(v0, v7); -} - // For each 8x32 block __m128i in[32], // Input with odd index, -// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 -// output pixels: 16-23, 24-31 in __m128i in[32] -// We avoid hide an offset, 16, inside this function. So we output 0-15 into -// array out[16] -static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/, - __m128i *out /*out[16]*/) { - __m128i v16, v17, v18, v19, v20, v21, v22, v23; - __m128i v24, v25, v26, v27, v28, v29, v30, v31; - __m128i u16, u17, u18, u19, u20, u21, u22, u23; - __m128i u24, u25, u26, u27, u28, u29, u30, u31; - - { - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31); - butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30); - butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29); - butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28); - - butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27); - butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26); - - butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25); - butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24); - } - - v16 = _mm_add_epi16(u16, u17); - v17 = _mm_sub_epi16(u16, u17); - v18 = _mm_sub_epi16(u19, u18); - v19 = _mm_add_epi16(u19, u18); - - v20 = _mm_add_epi16(u20, u21); - v21 = _mm_sub_epi16(u20, u21); - v22 = _mm_sub_epi16(u23, u22); - v23 = _mm_add_epi16(u23, u22); - - v24 = _mm_add_epi16(u24, u25); - v25 = _mm_sub_epi16(u24, u25); - v26 = _mm_sub_epi16(u27, u26); - v27 = _mm_add_epi16(u27, u26); - - v28 = _mm_add_epi16(u28, u29); - v29 = _mm_sub_epi16(u28, u29); - v30 = _mm_sub_epi16(u31, u30); - v31 = _mm_add_epi16(u31, u30); - - { - const __m128i stg3_4 = 
pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - butterfly_self(&v17, &v30, &stg3_4, &stg3_5); - butterfly_self(&v18, &v29, &stg3_6, &stg3_4); - butterfly_self(&v21, &v26, &stg3_8, &stg3_9); - butterfly_self(&v22, &v25, &stg3_10, &stg3_8); - } - - u16 = _mm_add_epi16(v16, v19); - u17 = _mm_add_epi16(v17, v18); - u18 = _mm_sub_epi16(v17, v18); - u19 = _mm_sub_epi16(v16, v19); - u20 = _mm_sub_epi16(v23, v20); - u21 = _mm_sub_epi16(v22, v21); - u22 = _mm_add_epi16(v22, v21); - u23 = _mm_add_epi16(v23, v20); - - u24 = _mm_add_epi16(v24, v27); - u25 = _mm_add_epi16(v25, v26); - u26 = _mm_sub_epi16(v25, v26); - u27 = _mm_sub_epi16(v24, v27); - - u28 = _mm_sub_epi16(v31, v28); - u29 = _mm_sub_epi16(v30, v29); - u30 = _mm_add_epi16(v29, v30); - u31 = _mm_add_epi16(v28, v31); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(u16, u23); - out[1] = _mm_add_epi16(u17, u22); - out[2] = _mm_add_epi16(u18, u21); - out[3] = _mm_add_epi16(u19, u20); - out[4] = _mm_sub_epi16(u19, u20); - out[5] = _mm_sub_epi16(u18, u21); - out[6] = _mm_sub_epi16(u17, u22); - out[7] = _mm_sub_epi16(u16, u23); - - out[8] = _mm_sub_epi16(u31, u24); - out[9] = _mm_sub_epi16(u30, u25); - out[10] = _mm_sub_epi16(u29, u26); - out[11] = _mm_sub_epi16(u28, u27); - out[12] = _mm_add_epi16(u27, u28); - out[13] = _mm_add_epi16(u26, u29); - out[14] = _mm_add_epi16(u25, u30); - out[15] = _mm_add_epi16(u24, u31); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0); - butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0); - butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0); - butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0); - } -} - -static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/, - __m128i *out /*out[32]*/) { - __m128i temp[16]; - idct32_full_8x32_quarter_1(in, temp); - idct32_full_8x32_quarter_2(in, &temp[8]); - add_sub_butterfly(temp, out, 16); +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + partial_butterfly_ssse3(in[15], -cospi_17_64, cospi_15_64, &step1[17], + &step1[30]); + partial_butterfly_ssse3(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + partial_butterfly_ssse3(in[11], -cospi_21_64, cospi_11_64, &step1[21], + &step1[26]); + + partial_butterfly_ssse3(in[13], cospi_19_64, cospi_13_64, &step1[22], + 
&step1[25]); + partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi16(step1[16], step1[17]); + step2[17] = _mm_sub_epi16(step1[16], step1[17]); + step2[18] = _mm_sub_epi16(step1[19], step1[18]); + step2[19] = _mm_add_epi16(step1[19], step1[18]); + step2[20] = _mm_add_epi16(step1[20], step1[21]); + step2[21] = _mm_sub_epi16(step1[20], step1[21]); + step2[22] = _mm_sub_epi16(step1[23], step1[22]); + step2[23] = _mm_add_epi16(step1[23], step1[22]); + + step2[24] = _mm_add_epi16(step1[24], step1[25]); + step2[25] = _mm_sub_epi16(step1[24], step1[25]); + step2[26] = _mm_sub_epi16(step1[27], step1[26]); + step2[27] = _mm_add_epi16(step1[27], step1[26]); + step2[28] = _mm_add_epi16(step1[28], step1[29]); + step2[29] = _mm_sub_epi16(step1[28], step1[29]); + step2[30] = _mm_sub_epi16(step1[31], step1[30]); + step2[31] = _mm_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); } -static void idct32_full_8x32(const __m128i *in /*in[32]*/, - __m128i *out /*out[32]*/) { +void idct32_135_8x32_ssse3(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { __m128i temp[32]; - idct32_full_8x32_quarter_1_2(in, temp); - idct32_full_8x32_quarter_3_4(in, &temp[16]); + idct32_135_8x32_quarter_1_2(in, temp); + idct32_135_8x32_quarter_3_4(in, temp); + // final stage add_sub_butterfly(temp, out, 32); } -static void load_buffer_8x32(const tran_low_t *input, __m128i *in) { +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[2][32], io[32]; int i; - for (i = 0; i < 8; ++i) { - in[i] = load_input_data(input); - in[i + 8] = load_input_data(input + 8); - in[i + 16] = load_input_data(input + 16); - in[i + 24] = load_input_data(input + 24); - input += 32; - } -} - -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i col[128], in[32]; - int i, j; // rows - for (i = 0; i < 4; ++i) { - load_buffer_8x32(input, in); + for (i = 0; i < 2; i++) { + load_transpose_16bit_8x8(&input[0], 32, &io[0]); + load_transpose_16bit_8x8(&input[8], 32, &io[8]); + idct32_135_8x32_ssse3(io, col[i]); input += 32 << 3; - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - idct32_full_8x32(in, col + (i << 5)); } // columns - for (i = 0; i < 4; ++i) { - j = i << 3; - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); - - idct32_full_8x32(in, in); - store_buffer_8x32(in, dest, stride); + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + idct32_135_8x32_ssse3(io, io); + store_buffer_8x32(io, dest, stride); dest += 8; } } 
diff --git a/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h b/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h new file mode 100644 index 000000000..e785c8eda --- /dev/null +++ b/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_INV_TXFM_SSSE3_H_ +#define VPX_DSP_X86_INV_TXFM_SSSE3_H_ + +#include <tmmintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) { + const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64); + const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64); + const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64); + const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64)); + const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64)); + const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64)); + const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64)); + const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64)); + const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64)); + const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64)); + __m128i step1[8], step2[8], tmp[4]; + + // pass 1 + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + tmp[0] = _mm_unpacklo_epi64(io[0], io[0]); + tmp[1] = _mm_unpackhi_epi64(io[0], io[0]); + tmp[2] = _mm_unpacklo_epi64(io[1], io[1]); + tmp[3] = _mm_unpackhi_epi64(io[1], io[1]); + step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7 + step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6 + + // stage 2 + step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1 + step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6 + + // stage 3 + tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]); + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6 + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 + + // pass 2 + + idct8x8_12_transpose_16bit_4x8(tmp, io); + + // stage 1 + step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d); + step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d); + step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d); + step1[6] = 
_mm_mulhrs_epi16(io[3], cospi_12_64d); + + // stage 2 + step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d); // step2[1] = step2[0] + step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d); + step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[0], step2[2]); + step1[2] = _mm_sub_epi16(step2[0], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + + // stage 4 + io[0] = _mm_add_epi16(step1[0], step2[7]); + io[1] = _mm_add_epi16(step1[1], step1[6]); + io[2] = _mm_add_epi16(step1[2], step1[5]); + io[3] = _mm_add_epi16(step1[3], step2[4]); + io[4] = _mm_sub_epi16(step1[3], step2[4]); + io[5] = _mm_sub_epi16(step1[2], step1[5]); + io[6] = _mm_sub_epi16(step1[1], step1[6]); + io[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out); + +#endif // VPX_DSP_X86_INV_TXFM_SSSE3_H_ diff --git a/libvpx/vpx_dsp/x86/mem_sse2.h b/libvpx/vpx_dsp/x86/mem_sse2.h new file mode 100644 index 000000000..2ce738fb7 --- /dev/null +++ b/libvpx/vpx_dsp/x86/mem_sse2.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_X86_MEM_SSE2_H_ +#define VPX_DSP_X86_MEM_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_config.h" + +static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); + d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride)); + d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride)); + d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride)); +} + +static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + load_8bit_4x4(s + 0 * stride, stride, &d[0]); + load_8bit_4x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride)); + d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride)); + d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride)); + d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride)); +} + +static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + load_8bit_8x4(s + 0 * stride, stride, &d[0]); + load_8bit_8x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void load_8bit_16x8(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride)); + d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride)); + d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride)); + d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride)); + d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride)); + d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride)); + d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride)); + d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride)); +} + +static INLINE void loadu_8bit_16x4(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride)); + d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride)); + d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride)); + d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride)); +} + +static INLINE void loadu_8bit_16x8(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + loadu_8bit_16x4(s + 0 * stride, stride, &d[0]); + loadu_8bit_16x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) { + _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s)); +} + +static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]); + *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]); + *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]); + *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]); +} + +static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d, + const ptrdiff_t stride) { + __m128i ss[4]; + + ss[0] = s; + ss[1] = _mm_srli_si128(s, 4); + ss[2] = _mm_srli_si128(s, 8); + ss[3] = _mm_srli_si128(s, 12); + store_8bit_4x4(ss, d, stride); +} + +static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s, + uint8_t *const d, + const ptrdiff_t stride) { + _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); + _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]); + _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]); + _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]); +} + +static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d, + 
const ptrdiff_t stride) { + _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); + _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]); + _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]); + _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]); + _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]); + _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]); + _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]); + _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]); +} + +static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]); + _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]); + _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]); + _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]); +} + +#endif // VPX_DSP_X86_MEM_SSE2_H_ diff --git a/libvpx/vpx_dsp/x86/quantize_avx.c b/libvpx/vpx_dsp/x86/quantize_avx.c new file mode 100644 index 000000000..6f4489004 --- /dev/null +++ b/libvpx/vpx_dsp/x86/quantize_avx.c @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#if defined(_MSC_VER) +#include <intrin.h> +#endif +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_x86.h" + +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m256i big_zero = _mm256_setzero_si256(); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + *eob_ptr = 0; + + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. 
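+ // Lane 0 of zbin/round/quant/shift holds the DC constant and the upper + // lanes hold the AC constant, so this first group of 16 runs with mixed + // DC/AC values before each vector is collapsed to its AC half below.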
+ coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (n_coeffs == 16) return; + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, + zero); + } + + // AC only loop. 
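+ // cmp_mask0/1 are -1 only where |coeff| > zbin, so when ptest finds both + // masks all-zero the whole group of 16 quantizes to zero; it is stored + // as zeros and skipped without running the multiply chain.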
+ for (index = 16; index < n_coeffs; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_avx( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m256i big_zero = _mm256_setzero_si256(); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + (void)scan_ptr; + (void)n_coeffs; + (void)skip_block; + assert(!skip_block); + + // Setup global values. + // The 32x32 halves zbin and round. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + // Shift with rounding. + zbin = _mm_add_epi16(zbin, one); + zbin = _mm_srli_epi16(zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + zbin = _mm_sub_epi16(zbin, one); + + round = _mm_load_si128((const __m128i *)round_ptr); + round = _mm_add_epi16(round, one); + round = _mm_srli_epi16(round, 1); + + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + shift = _mm_slli_epi16(shift, 1); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. 
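+ // zbin now holds the AC value in every lane (the unpack duplicated its + // upper half), so coefficients 8..15 and all later groups compare + // against the AC threshold only.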
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + // Un-sign to bias rounding like C. + // dequant is almost always negative, so this is probably the backwards way + // to handle the sign. However, it matches the previous assembly. + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + // "Divide" by 2. + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, + zero); + } + + // AC only loop. 
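+ // n_coeffs is not consulted here: a 32x32 block always carries + // 32 * 32 = 1024 coefficients, so the loop bound is a constant.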
+ for (index = 16; index < 32 * 32; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm b/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm deleted file mode 100644 index 01c41291b..000000000 --- a/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm +++ /dev/null @@ -1,544 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - - vzeroupper - - ; If we can skip this block, then just zero the output - cmp skipmp, 0 - jne .blank - -%ifnidn %1, b_32x32 - - ; Special case for ncoeff == 16, as it is frequent and we can save on - ; not setting up a loop. - cmp ncoeffmp, 16 - jne .generic - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Special case of ncoeff == 16 - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -.single: - - movifnidn coeffq, coeffmp - movifnidn zbinq, zbinmp - mova m0, [zbinq] ; m0 = zbin - - ; Get DC and first 15 AC coeffs - in this special case, that is all. 
-%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers but we process them as 16 bit numbers - mova m9, [coeffq] - packssdw m9, [coeffq+16] ; m9 = c[i] - mova m10, [coeffq+32] - packssdw m10, [coeffq+48] ; m10 = c[i] -%else - mova m9, [coeffq] ; m9 = c[i] - mova m10, [coeffq+16] ; m10 = c[i] -%endif - - mov r0, eobmp ; Output pointer - mov r1, qcoeffmp ; Output pointer - mov r2, dqcoeffmp ; Output pointer - - pxor m5, m5 ; m5 = dedicated zero - - pcmpeqw m4, m4 ; All word lanes -1 - paddw m0, m4 ; m0 = zbin - 1 - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, we just write zeros - ; to the outputs and we are done. - por m14, m7, m12 - ptest m14, m14 - jnz .single_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [r1 ], ymm5 - mova [r1+32], ymm5 - mova [r2 ], ymm5 - mova [r2+32], ymm5 -%else - mova [r1], ymm5 - mova [r2], ymm5 -%endif - mov [r0], word 0 - - vzeroupper - RET - -.single_nonzero: - - ; Actual quantization of size 16 block - setup pointers, rounders, etc. - movifnidn r4, roundmp - movifnidn r5, quantmp - mov r3, dequantmp - mov r6, shiftmp - mova m1, [r4] ; m1 = round - mova m2, [r5] ; m2 = quant - mova m3, [r3] ; m3 = dequant - mova m4, [r6] ; m4 = shift - - mov r3, iscanmp - - DEFINE_ARGS eob, qcoeff, dqcoeff, iscan - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [qcoeffq ], m11 - mova [qcoeffq+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+32], m11 - mova [qcoeffq+48], m6 -%else - mova [qcoeffq ], m8 - mova [qcoeffq+16], m13 -%endif - - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q - -%if CONFIG_VP9_HIGHBITDEPTH - ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [dqcoeffq ], m11 - mova [dqcoeffq+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+32], m11 - mova [dqcoeffq+48], m6 -%else - mova [dqcoeffq ], m8 - mova [dqcoeffq+16], m13 -%endif - - mova m6, [iscanq] ; m6 = scan[i] - mova m11, [iscanq+16] ; m11 = scan[i] - - pcmpeqw m8, m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m13, m5 ; m13 = c[i] == 0 - psubw m6, m6, m7 ; m6 = scan[i] + 1 - psubw m11, m11, m12 ; m11 = scan[i] + 1 - pandn m8, m8, m6 ; m8 = max(eob) - pandn m13, m13, m11 ; m13 = max(eob) - pmaxsw m8, m8, m13 - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [eobq], ax - - vzeroupper - RET - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Generic case of ncoeff != 16 - 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -.generic: - -%endif ; %ifnidn %1, b_32x32 - -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ - qcoeff, dqcoeff, dequant, eob, scan, iscan - - ; Actual quantization loop - setup pointers, rounders, etc. - movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp - movifnidn zbinq, zbinmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m0, [zbinq] ; m0 = zbin - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant - mova m3, [r2] ; m3 = dequant - pcmpeqw m4, m4 ; All lanes -1 -%ifidn %1, b_32x32 - psubw m0, m4 - psubw m1, m4 - psrlw m0, 1 ; m0 = (m0 + 1) / 2 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - paddw m0, m4 ; m0 = m0 + 1 - - mov r2, shiftmp - mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob - -%if CONFIG_VP9_HIGHBITDEPTH - lea coeffq, [ coeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - lea dqcoeffq, [dqcoeffq+ncoeffq*4] -%else - lea coeffq, [ coeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] -%endif - lea iscanq, [ iscanq+ncoeffq*2] - neg ncoeffq - - ; get DC and first 15 AC coeffs -%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers & require 16bit numbers - mova m9, [coeffq+ncoeffq*4+ 0] - packssdw m9, [coeffq+ncoeffq*4+16] - mova m10, [coeffq+ncoeffq*4+32] - packssdw m10, [coeffq+ncoeffq*4+48] -%else - mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, skip forward quickly. 
- por m14, m7, m12 - ptest m14, m14 - jnz .first_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4 ], ymm5 - mova [qcoeffq+ncoeffq*4+32], ymm5 - mova [dqcoeffq+ncoeffq*4 ], ymm5 - mova [dqcoeffq+ncoeffq*4+32], ymm5 -%else - mova [qcoeffq+ncoeffq*2], ymm5 - mova [dqcoeffq+ncoeffq*2], ymm5 -%endif - - add ncoeffq, mmsize - - punpckhqdq m1, m1 - punpckhqdq m2, m2 - punpckhqdq m3, m3 - punpckhqdq m4, m4 - pxor m8, m8 - - jmp .ac_only_loop - -.first_nonzero: - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 -%else - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif - -%ifidn %1, b_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 -%endif - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [iscanq+ncoeffq*2] ; m6 = scan[i] - mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - -.ac_only_loop: - -%if CONFIG_VP9_HIGHBITDEPTH - ; pack coeff from 32bit to 16bit array - mova m9, [coeffq+ncoeffq*4+ 0] - packssdw m9, [coeffq+ncoeffq*4+16] - mova m10, [coeffq+ncoeffq*4+32] - packssdw m10, [coeffq+ncoeffq*4+48] -%else - mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, skip this itertion. - ; And just write zeros as the result would be. 
- por m14, m7, m12 - ptest m14, m14 - jnz .rest_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4+ 0], ymm5 - mova [qcoeffq+ncoeffq*4+32], ymm5 - mova [dqcoeffq+ncoeffq*4+ 0], ymm5 - mova [dqcoeffq+ncoeffq*4+32], ymm5 -%else - mova [qcoeffq+ncoeffq*2+ 0], ymm5 - mova [dqcoeffq+ncoeffq*2+ 0], ymm5 -%endif - add ncoeffq, mmsize - jnz .ac_only_loop - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [r2], ax - vzeroupper - RET - -.rest_nonzero: - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m14, m6 ; m14 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m14, m4 ; m14 = m14*qsh>>16 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m14, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m14 - punpckhwd m6, m14, m6 - pmovsxwd m11, m14 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 -%else - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif - -%ifidn %1, b_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; dqc[i] = qc[i] * q - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m14 - punpckhwd m6, m14, m6 - pmovsxwd m11, m14 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jnz .ac_only_loop - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [r2], ax - vzeroupper - RET - - ; Skip-block, i.e. 
just write all zeroes -.blank: - -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ - qcoeff, dqcoeff, dequant, eob, scan, iscan - - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - -DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob - -%if CONFIG_VP9_HIGHBITDEPTH - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] -%else - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] -%endif - - neg ncoeffq - pxor m7, m7 - -.blank_loop: -%if CONFIG_VP9_HIGHBITDEPTH - mova [dqcoeffq+ncoeffq*4+ 0], ymm7 - mova [dqcoeffq+ncoeffq*4+32], ymm7 - mova [qcoeffq+ncoeffq*4+ 0], ymm7 - mova [qcoeffq+ncoeffq*4+32], ymm7 -%else - mova [dqcoeffq+ncoeffq*2+ 0], ymm7 - mova [qcoeffq+ncoeffq*2+ 0], ymm7 -%endif - add ncoeffq, mmsize - jl .blank_loop - - mov [eobq], word 0 - - vzeroupper - RET -%endmacro - -INIT_XMM avx -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 - -END diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.c b/libvpx/vpx_dsp/x86/quantize_sse2.c index 32721beb3..c020b398c 100644 --- a/libvpx/vpx_dsp/x86/quantize_sse2.c +++ b/libvpx/vpx_dsp/x86/quantize_sse2.c @@ -8,12 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> #include <emmintrin.h> #include <xmmintrin.h> #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_x86.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, @@ -22,202 +24,103 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - __m128i zero; + const __m128i zero = _mm_setzero_si128(); + int index = 16; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + // Setup global values. + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + // Poor man's abs(). 
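+ // SSE2 lacks pabsw, so invert_sign_sse2() builds abs() from the sign + // mask: psraw by 15 gives 0 or -1 per lane, and xor followed by + // subtracting the mask negates exactly the negative lanes.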
+ coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); - coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - if (!skip_block) { - __m128i eob; - __m128i zbin; - __m128i round, quant, dequant, shift; - { - __m128i coeff0, coeff1; - - // Setup global values - { - __m128i pw_1; - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - pw_1 = _mm_set1_epi16(1); - zbin = _mm_sub_epi16(zbin, pw_1); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - // Do DC and first 15 AC - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - shift = _mm_unpackhi_epi64(shift, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - 
store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - 
eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - store_tran_low(zero, dqcoeff_ptr + n_coeffs); - store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8); - store_tran_low(zero, qcoeff_ptr + n_coeffs); - store_tran_low(zero, qcoeff_ptr + n_coeffs + 8); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; } + + *eob_ptr = accumulate_eob(eob); } diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libvpx/vpx_dsp/x86/quantize_ssse3.c new file mode 100644 index 000000000..3f528e1a9 --- /dev/null +++ b/libvpx/vpx_dsp/x86/quantize_ssse3.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <tmmintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_x86.h" + +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + int index = 16; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. 
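+ // calculate_qcoeff() is the vector form of the scalar reference: with + // t = saturate16(|coeff| + round), the quantized magnitude is + // (((t * quant) >> 16) + t) * shift >> 16.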
+ coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_ssse3( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + (void)scan_ptr; + (void)n_coeffs; + (void)skip_block; + assert(!skip_block); + + // Setup global values. + // The 32x32 halves zbin and round. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + // Shift with rounding. + zbin = _mm_add_epi16(zbin, one); + zbin = _mm_srli_epi16(zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. 
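+ // Net effect per lane: zbin becomes ROUND_POWER_OF_TWO(zbin, 1) - 1, so + // the pcmpgtw below implements |coeff| >= ROUND_POWER_OF_TWO(zbin, 1).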
+ zbin = _mm_sub_epi16(zbin, one); + + round = _mm_load_si128((const __m128i *)round_ptr); + round = _mm_add_epi16(round, one); + round = _mm_srli_epi16(round, 1); + + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + shift = _mm_slli_epi16(shift, 1); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + // Un-sign to bias rounding like C. + // dequant is almost always negative, so this is probably the backwards way + // to handle the sign. However, it matches the previous assembly. + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + // "Divide" by 2. + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, + zero); + } + + // AC only loop.
+ for (index = 16; index < 32 * 32; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm deleted file mode 100644 index ec2cafb94..000000000 --- a/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm +++ /dev/null @@ -1,345 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_1: times 8 dw 1 - -SECTION .text - -; TODO(yunqingwang)fix quantize_b code for skip=1 case. -%macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - cmp dword skipm, 0 - jne .blank - - ; actual quantize loop - setup pointers, rounders, etc. 
- movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp - movifnidn zbinq, zbinmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m0, [zbinq] ; m0 = zbin - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant -%ifidn %1, b_32x32 - pcmpeqw m5, m5 - psrlw m5, 15 - paddw m0, m5 - paddw m1, m5 - psrlw m0, 1 ; m0 = (m0 + 1) / 2 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - mova m3, [r2q] ; m3 = dequant - psubw m0, [pw_1] - mov r2, shiftmp - mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob -%if CONFIG_VP9_HIGHBITDEPTH - lea coeffq, [ coeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - lea dqcoeffq, [dqcoeffq+ncoeffq*4] -%else - lea coeffq, [ coeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] -%endif - lea iscanq, [ iscanq+ncoeffq*2] - neg ncoeffq - - ; get DC and first 15 AC coeffs -%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers & require 16bit numbers - mova m9, [ coeffq+ncoeffq*4+ 0] - packssdw m9, [ coeffq+ncoeffq*4+16] - mova m10, [ coeffq+ncoeffq*4+32] - packssdw m10, [ coeffq+ncoeffq*4+48] -%else - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m8 - mova m6, m8 - pcmpgtw m5, m8 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif -%ifidn %1, b_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 -%endif -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m8 - mova m6, m8 - pcmpgtw m5, m8 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 
= scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - jz .accumulate_eob - -.ac_only_loop: -%if CONFIG_VP9_HIGHBITDEPTH - ; pack coeff from 32bit to 16bit array - mova m9, [ coeffq+ncoeffq*4+ 0] - packssdw m9, [ coeffq+ncoeffq*4+16] - mova m10, [ coeffq+ncoeffq*4+32] - packssdw m10, [ coeffq+ncoeffq*4+48] -%else - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin -%ifidn %1, b_32x32 - pmovmskb r6d, m7 - pmovmskb r2d, m12 - or r6, r2 - jz .skip_iter -%endif - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m14, m6 ; m14 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m14, m4 ; m14 = m14*qsh>>16 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m14, m7 - pand m13, m12 -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m14 - mova m6, m14 - pcmpgtw m5, m14 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif -%ifidn %1, b_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; dqc[i] = qc[i] * q - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m14 - mova m6, m14 - pcmpgtw m5, m14 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jl .ac_only_loop - -%ifidn %1, b_32x32 - jmp .accumulate_eob -.skip_iter: -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4+ 0], m5 - mova [qcoeffq+ncoeffq*4+16], m5 - mova [qcoeffq+ncoeffq*4+32], m5 - mova [qcoeffq+ncoeffq*4+48], m5 - mova [dqcoeffq+ncoeffq*4+ 0], m5 - mova [dqcoeffq+ncoeffq*4+16], m5 - mova [dqcoeffq+ncoeffq*4+32], m5 - mova [dqcoeffq+ncoeffq*4+48], m5 -%else - mova [qcoeffq+ncoeffq*2+ 0], m5 - mova [qcoeffq+ncoeffq*2+16], m5 - mova [dqcoeffq+ncoeffq*2+ 0], m5 - mova [dqcoeffq+ncoeffq*2+16], m5 -%endif - add ncoeffq, mmsize - jl .ac_only_loop -%endif - -.accumulate_eob: - ; horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw 
m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - pextrw r6, m8, 0 - mov [r2], r6 - RET - - ; skip-block, i.e. just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob -%if CONFIG_VP9_HIGHBITDEPTH - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] -%else - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] -%endif - neg ncoeffq - pxor m7, m7 -.blank_loop: -%if CONFIG_VP9_HIGHBITDEPTH - mova [dqcoeffq+ncoeffq*4+ 0], m7 - mova [dqcoeffq+ncoeffq*4+16], m7 - mova [dqcoeffq+ncoeffq*4+32], m7 - mova [dqcoeffq+ncoeffq*4+48], m7 - mova [qcoeffq+ncoeffq*4+ 0], m7 - mova [qcoeffq+ncoeffq*4+16], m7 - mova [qcoeffq+ncoeffq*4+32], m7 - mova [qcoeffq+ncoeffq*4+48], m7 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m7 - mova [dqcoeffq+ncoeffq*2+16], m7 - mova [qcoeffq+ncoeffq*2+ 0], m7 - mova [qcoeffq+ncoeffq*2+16], m7 -%endif - add ncoeffq, mmsize - jl .blank_loop - mov word [eobq], 0 - RET -%endmacro - -INIT_XMM ssse3 -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 diff --git a/libvpx/vpx_dsp/x86/quantize_x86.h b/libvpx/vpx_dsp/x86/quantize_x86.h new file mode 100644 index 000000000..34928fbb5 --- /dev/null +++ b/libvpx/vpx_dsp/x86/quantize_x86.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" + +static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, + const int16_t *round_ptr, __m128i *round, + const int16_t *quant_ptr, __m128i *quant, + const int16_t *dequant_ptr, __m128i *dequant, + const int16_t *shift_ptr, __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)zbin_ptr); + *round = _mm_load_si128((const __m128i *)round_ptr); + *quant = _mm_load_si128((const __m128i *)quant_ptr); + *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)shift_ptr); +} + +// With ssse3 and later abs() and sign() are preferred. +static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi16(a, sign); +} + +static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, + const __m128i quant, const __m128i shift) { + __m128i tmp, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + *coeff = _mm_mulhi_epi16(qcoeff, shift); +} + +static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { + return _mm_mullo_epi16(qcoeff, dequant); +} + +// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing +// to zbin to add 1 to the index in 'scan'. 
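Editor's note, not part of the patch: the mask arithmetic described above, and implemented by scan_for_eob() just below, is easiest to sanity-check against a scalar model. Subtracting a comparison mask of -1 is the same as adding 1, so each lane holds scan[i] + 1 for coefficients that passed the zbin test, the andnot clears lanes whose quantized coefficient is 0, and a running max yields the eob as a count. A minimal sketch under those assumptions (the helper name is hypothetical):

#include <stdint.h>

/* Hypothetical scalar model of the eob bookkeeping; illustration only. */
static int16_t scan_for_eob_scalar(const int16_t *qcoeff,
                                   const int16_t *zbin_mask, /* 0 or -1 */
                                   const int16_t *scan, int n) {
  int16_t eob = 0;
  int i;
  for (i = 0; i < n; i++) {
    /* scan[i] - (-1) == scan[i] + 1: turns a scan index into a count. */
    const int16_t candidate = scan[i] - zbin_mask[i];
    if (qcoeff[i] != 0 && candidate > eob) eob = candidate;
  }
  return eob; /* accumulate_eob() below does the same max across SIMD lanes */
}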
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, + const __m128i zbin_mask0, + const __m128i zbin_mask1, + const int16_t *scan_ptr, const int index, + const __m128i zero) { + const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); + __m128i eob0, eob1; + // Add one to convert from indices to counts + scan0 = _mm_sub_epi16(scan0, zbin_mask0); + scan1 = _mm_sub_epi16(scan1, zbin_mask1); + eob0 = _mm_andnot_si128(zero_coeff0, scan0); + eob1 = _mm_andnot_si128(zero_coeff1, scan1); + return _mm_max_epi16(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} diff --git a/libvpx/vpx_dsp/x86/sad4d_avx512.c b/libvpx/vpx_dsp/x86/sad4d_avx512.c new file mode 100644 index 000000000..5f2ab6ea7 --- /dev/null +++ b/libvpx/vpx_dsp/x86/sad4d_avx512.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <immintrin.h> // AVX512 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m512i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm512_set1_epi16(0); + sum_ref1 = _mm512_set1_epi16(0); + sum_ref2 = _mm512_set1_epi16(0); + sum_ref3 = _mm512_set1_epi16(0); + for (i = 0; i < 64; i++) { + // load src and all refs + src_reg = _mm512_loadu_si512((const __m512i *)src); + ref0_reg = _mm512_loadu_si512((const __m512i *)ref0); + ref1_reg = _mm512_loadu_si512((const __m512i *)ref1); + ref2_reg = _mm512_loadu_si512((const __m512i *)ref2); + ref3_reg = _mm512_loadu_si512((const __m512i *)ref3); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + { + __m256i sum256; + __m128i sum128; + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. 
+ // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4); + sum_ref3 = _mm512_bslli_epi128(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1); + sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm512_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum256 = _mm256_add_epi32(_mm512_castsi512_si256(sum_mlow), + _mm512_extracti32x8_epi32(sum_mlow, 1)); + sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256), + _mm256_extractf128_si256(sum256, 1)); + + _mm_storeu_si128((__m128i *)(res), sum128); + } +} diff --git a/libvpx/vpx_dsp/x86/sad_sse3.asm b/libvpx/vpx_dsp/x86/sad_sse3.asm index 18279bdb9..175dcc089 100644 --- a/libvpx/vpx_dsp/x86/sad_sse3.asm +++ b/libvpx/vpx_dsp/x86/sad_sse3.asm @@ -165,6 +165,8 @@ paddw mm7, mm3 %endmacro +SECTION .text + ;void int vpx_sad16x16x3_sse3( ; unsigned char *src_ptr, ; int src_stride, diff --git a/libvpx/vpx_dsp/x86/sad_sse4.asm b/libvpx/vpx_dsp/x86/sad_sse4.asm index bc6744797..03999dfca 100644 --- a/libvpx/vpx_dsp/x86/sad_sse4.asm +++ b/libvpx/vpx_dsp/x86/sad_sse4.asm @@ -165,6 +165,8 @@ movdqa [rdi + 16], xmm2 %endmacro +SECTION .text + ;void vpx_sad16x16x8_sse4_1( ; const unsigned char *src_ptr, ; int src_stride, diff --git a/libvpx/vpx_dsp/x86/sad_ssse3.asm b/libvpx/vpx_dsp/x86/sad_ssse3.asm index 49f204fa0..7cf93cf51 100644 --- a/libvpx/vpx_dsp/x86/sad_ssse3.asm +++ b/libvpx/vpx_dsp/x86/sad_ssse3.asm @@ -146,6 +146,8 @@ %endmacro +SECTION .text + ;void int vpx_sad16x16x3_ssse3( ; unsigned char *src_ptr, ; int src_stride, diff --git a/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm b/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm index 6d58321e0..300fa8aab 100644 --- a/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm +++ b/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm @@ -44,6 +44,9 @@ paddd %1, xmm1 SUM_ACROSS_Q %1 %endmacro + +SECTION .text + ;void ssim_parms_sse2( ; unsigned char *s, ; int sp, diff --git a/libvpx/vpx_dsp/x86/transpose_sse2.h b/libvpx/vpx_dsp/x86/transpose_sse2.h index a5e40245a..8a0119ca7 100644 --- a/libvpx/vpx_dsp/x86/transpose_sse2.h +++ b/libvpx/vpx_dsp/x86/transpose_sse2.h @@ -11,45 +11,357 @@ #ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_ #define VPX_DSP_X86_TRANSPOSE_SSE2_H_ -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" +#include <emmintrin.h> // SSE2 -static INLINE void transpose_16bit_4x4(__m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); +#include "./vpx_config.h" - res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); - res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); +static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + return _mm_unpacklo_epi16(a0, a1); +} + +static INLINE void transpose_8bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); + + // Unpack 16 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpackhi_epi16(a0, a1); + const __m128i b2 = _mm_unpacklo_epi16(a2, a3); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + + // Unpack 32 bit elements resulting in: + // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const __m128i c0 = _mm_unpacklo_epi32(b0, b2); + const __m128i c1 = _mm_unpackhi_epi32(b0, b2); + const __m128i c2 = _mm_unpacklo_epi32(b1, b3); + const __m128i c3 = _mm_unpackhi_epi32(b1, b3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(c0, c0); + out[1] = _mm_unpackhi_epi64(c0, c0); + out[2] = _mm_unpacklo_epi64(c1, c1); + out[3] = _mm_unpackhi_epi64(c1, c1); + out[4] = _mm_unpacklo_epi64(c2, c2); + out[5] = _mm_unpackhi_epi64(c2, c2); + out[6] = _mm_unpacklo_epi64(c3, c3); + out[7] = _mm_unpackhi_epi64(c3, c3); +} + +static INLINE void transpose_16bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // out[0]: 00 10 20 30 01 11 21 31 + // out[1]: 02 12 22 32 03 13 23 33 + out[0] = _mm_unpacklo_epi32(a0, a1); + out[1] = _mm_unpackhi_epi32(a0, a1); +} + +static INLINE void transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // in[4]: 40 41 42 43 XX XX XX XX + // in[5]: 50 51 52 53 XX XX XX XX + // in[6]: 60 61 62 63 XX XX XX XX + // in[7]: 70 71 72 73 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 02 12 22 32 03 13 23 33 + // b3: 42 52 62 72 43 53 63 73 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpackhi_epi32(a0, a1); + const __m128i b3 = _mm_unpackhi_epi32(a2, a3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b2, b3); + out[3] = _mm_unpackhi_epi64(b2, b3); +} + +static INLINE void transpose_16bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + // a6: 44 54 45 55 46 56 47 57 + // a7: 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 04 14 24 34 05 15 25 35 + // b3: 44 54 64 74 45 55 65 75 + // b4: 02 12 22 32 03 13 23 33 + // b5: 42 52 62 72 43 53 63 73 + // b6: 06 16 26 36 07 17 27 37 + // b7: 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b3 = _mm_unpacklo_epi32(a6, a7); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b5 = _mm_unpackhi_epi32(a2, a3); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + const __m128i b7 = _mm_unpackhi_epi32(a6, a7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b4, b5); + out[3] = _mm_unpackhi_epi64(b4, b5); + out[4] = _mm_unpacklo_epi64(b2, b3); + out[5] = _mm_unpackhi_epi64(b2, b3); + out[6] = _mm_unpacklo_epi64(b6, b7); + out[7] = _mm_unpackhi_epi64(b6, b7); +} + +// Transpose in-place +static INLINE void transpose_16bit_16x16(__m128i *const left, + __m128i *const right) { + __m128i tbuf[8]; + transpose_16bit_8x8(left, left); + transpose_16bit_8x8(right, tbuf); + transpose_16bit_8x8(left + 8, right); + transpose_16bit_8x8(right + 8, right + 8); + + left[8] = tbuf[0]; + left[9] = tbuf[1]; + left[10] = tbuf[2]; + left[11] = tbuf[3]; + left[12] = tbuf[4]; + left[13] = tbuf[5]; + left[14] = tbuf[6]; + left[15] = tbuf[7]; +} + +static INLINE void transpose_32bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); } -static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1, - __m128i *const a2, __m128i *const a3) { +static INLINE void transpose_32bit_4x4x2(const __m128i *const in, + __m128i *const out) { // Unpack 32 bit elements. Goes from: - // a0: 00 01 02 03 - // a1: 10 11 12 13 - // a2: 20 21 22 23 - // a3: 30 31 32 33 + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // in[4]: 04 05 06 07 + // in[5]: 14 15 16 17 + // in[6]: 24 25 26 27 + // in[7]: 34 35 36 37 // to: - // b0: 00 10 01 11 - // b1: 20 30 21 31 - // b2: 02 12 03 13 - // b3: 22 32 23 33 + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); + const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); - const __m128i b0 = _mm_unpacklo_epi32(*a0, *a1); - const __m128i b1 = _mm_unpacklo_epi32(*a2, *a3); - const __m128i b2 = _mm_unpackhi_epi32(*a0, *a1); - const __m128i b3 = _mm_unpackhi_epi32(*a2, *a3); + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +static INLINE void transpose_32bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 04 05 06 07 + // in[2]: 10 11 12 13 + // in[3]: 14 15 16 17 + // in[4]: 20 21 22 23 + // in[5]: 24 25 26 27 + // in[6]: 30 31 32 33 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); + const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); + const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); + const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); + const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); + const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); // Unpack 64 bit elements resulting in: - // a0: 00 10 20 30 - // a1: 01 11 21 31 - // a2: 02 12 22 32 - // a3: 03 13 23 33 - *a0 = _mm_unpacklo_epi64(b0, b1); - *a1 = _mm_unpackhi_epi64(b0, b1); - *a2 = _mm_unpacklo_epi64(b2, b3); - *a3 = _mm_unpackhi_epi64(b2, b3); + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); } #endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/libvpx/vpx_dsp/x86/txfm_common_sse2.h index f8edb1b78..0a9542c85 100644 --- a/libvpx/vpx_dsp/x86/txfm_common_sse2.h +++ b/libvpx/vpx_dsp/x86/txfm_common_sse2.h @@ -18,6 +18,9 @@ _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) +#define pair_set_epi32(a, b) \ + _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a)) + #define dual_set_epi16(a, b) \ _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) diff --git a/libvpx/vpx_dsp/x86/variance_avx2.c b/libvpx/vpx_dsp/x86/variance_avx2.c index 8428e0520..d15a89c74 100644 --- a/libvpx/vpx_dsp/x86/variance_avx2.c +++ b/libvpx/vpx_dsp/x86/variance_avx2.c @@ -7,16 +7,592 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ + +#include <immintrin.h> // AVX2 + #include "./vpx_dsp_rtcd.h" +/* clang-format off */ +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, +}; + +DECLARE_ALIGNED(32, static const int8_t, adjacent_sub_avx2[32]) = { + 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, + 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 +}; +/* clang-format on */ + +void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *sse, int *sum) { + unsigned int i, src_2strides, ref_2strides; + __m256i sum_reg = _mm256_setzero_si256(); + __m256i sse_reg = _mm256_setzero_si256(); + // process two 16 byte locations in a 256 bit register + src_2strides = source_stride << 1; + ref_2strides = recon_stride << 1; + for (i = 0; i < 8; ++i) { + // convert up values in 128 bit registers across lanes + const __m256i src0 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const *)(src_ptr))); + const __m256i src1 = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i const *)(src_ptr + source_stride))); + const __m256i ref0 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const *)(ref_ptr))); + const __m256i ref1 = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride))); + const __m256i diff0 = _mm256_sub_epi16(src0, ref0); + const __m256i diff1 = _mm256_sub_epi16(src1, ref1); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + + // add to the running totals + sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff0, diff1)); + sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd0, madd1)); + + src_ptr += src_2strides; + ref_ptr += ref_2strides; + } + { + // extract the low lane and add it to the high lane + const __m128i sum_reg_128 = _mm_add_epi16( + _mm256_castsi256_si128(sum_reg), _mm256_extractf128_si256(sum_reg, 1)); + const __m128i sse_reg_128 = _mm_add_epi32( + _mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1)); + + // sum upper and lower 64 bits together and convert up to 32 bit values + const __m128i sum_reg_64 = + _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, 
_mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + *((int *)sum) = _mm_extract_epi32(res, 1); + } +} + +static void get32x16var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *sse, int *sum) { + unsigned int i, src_2strides, ref_2strides; + const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2); + __m256i sum_reg = _mm256_setzero_si256(); + __m256i sse_reg = _mm256_setzero_si256(); + + // process 64 elements in an iteration + src_2strides = source_stride << 1; + ref_2strides = recon_stride << 1; + for (i = 0; i < 8; i++) { + const __m256i src0 = _mm256_loadu_si256((__m256i const *)(src_ptr)); + const __m256i src1 = + _mm256_loadu_si256((__m256i const *)(src_ptr + source_stride)); + const __m256i ref0 = _mm256_loadu_si256((__m256i const *)(ref_ptr)); + const __m256i ref1 = + _mm256_loadu_si256((__m256i const *)(ref_ptr + recon_stride)); + + // unpack into pairs of source and reference values + const __m256i src_ref0 = _mm256_unpacklo_epi8(src0, ref0); + const __m256i src_ref1 = _mm256_unpackhi_epi8(src0, ref0); + const __m256i src_ref2 = _mm256_unpacklo_epi8(src1, ref1); + const __m256i src_ref3 = _mm256_unpackhi_epi8(src1, ref1); + + // subtract adjacent elements using src*1 + ref*-1 + const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); + const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); + const __m256i diff2 = _mm256_maddubs_epi16(src_ref2, adj_sub); + const __m256i diff3 = _mm256_maddubs_epi16(src_ref3, adj_sub); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + const __m256i madd2 = _mm256_madd_epi16(diff2, diff2); + const __m256i madd3 = _mm256_madd_epi16(diff3, diff3); + + // add to the running totals + sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff0, diff1)); + sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff2, diff3)); + sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd0, madd1)); + sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd2, madd3)); + + src_ptr += src_2strides; + ref_ptr += ref_2strides; + } + + { + // extract the low lane and add it to the high lane + const __m128i sum_reg_128 = _mm_add_epi16( + _mm256_castsi256_si128(sum_reg), _mm256_extractf128_si256(sum_reg, 1)); + const __m128i sse_reg_128 = _mm_add_epi32( + _mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1)); + + // sum upper and lower 64 bits together and convert up to 32 bit values + const __m128i sum_reg_64 = + _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + *((int *)sum) = _mm_extract_epi32(res, 1); + } +} + +#define FILTER_SRC(filter) \ + /* filter the source */ \ + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = 
_mm256_srai_epi16(exp_src_lo, 4); \
+  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+  /* expand each byte to 2 bytes */ \
+  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+  /* source - dest */ \
+  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum */ \
+  *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_lo); \
+  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+  *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_hi); \
+  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+  /* calculate sse */ \
+  *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_lo); \
+  *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_hi);
+
+// final calculation to sum and sse
+#define CALC_SUM_AND_SSE \
+  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+ \
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+ \
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride,
+                               const uint8_t *dst, int dst_stride,
+                               const uint8_t *sec, int sec_stride, int do_sec,
+                               int height, __m256i *sum_reg, __m256i *sse_reg) {
+  const __m256i zero_reg = _mm256_setzero_si256();
+  __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  int i;
+  for (i = 0; i < height; i++) {
+    const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+    const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src);
+    if (do_sec) {
+      const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec);
+      const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg);
+      exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+      sec += sec_stride;
+    } else {
+      exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+    }
+    CALC_SUM_SSE_INSIDE_LOOP
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+// (x == 0, y == 4) or (x == 4, y == 0). sstep determines the direction.
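Editor's aside, not part of the patch: the half-pel helpers that follow all reduce to one _mm256_avg_epu8, whose per-byte semantics are (a + b + 1) >> 1. A scalar sketch of the shared spv32_half_zero() path under that assumption, with sstep selecting the direction (src_stride for vertical, 1 for horizontal); the helper name is illustrative and the optional second-prediction average (do_sec) is omitted:

#include <stdint.h>

/* Scalar model of spv32_half_zero(); illustration only. */
static void spv32_half_zero_model(const uint8_t *src, int src_stride,
                                  const uint8_t *dst, int dst_stride,
                                  int height, int sstep, int *sum,
                                  unsigned int *sse) {
  int i, j;
  for (i = 0; i < height; i++) {
    for (j = 0; j < 32; j++) { /* 32 pixels per row, as in the AVX2 code */
      /* pavgb rounds up: (a + b + 1) >> 1 */
      const int pred = (src[j] + src[j + sstep] + 1) >> 1;
      const int diff = pred - dst[j];
      *sum += diff;
      *sse += (unsigned int)(diff * diff);
    }
    src += src_stride;
    dst += dst_stride;
  }
}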
+static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int sstep) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, src_stride); +} + +static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, 1); +} + +static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + const __m256i current_avg = _mm256_avg_epu8(prev_src_avg, src_avg); + prev_src_avg = src_avg; + + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg); + } + // save current source average + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +// (x == 0, y == bil) or (x == 4, y == bil). sstep determines the direction. 
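Editor's aside, not part of the patch: for the bilinear cases below, each 32-byte row of bilinear_filters_avx2 (selected by offset << 5) holds the interleaved two-tap pair (16 - 2*offset, 2*offset), so the maddubs in FILTER_SRC computes a*f0 + b*f1 per pixel, and the add-8/shift-4 pair is a round-to-nearest division by 16. A one-pixel scalar model of that step, with a hypothetical helper name:

/* Scalar model of one FILTER_SRC tap; offset is the subpel phase (0..7). */
static int bilinear_tap_model(int a, int b, int offset) {
  const int f0 = 16 - 2 * offset; /* first tap, e.g. 14 for offset 1 */
  const int f1 = 2 * offset;      /* second tap, e.g. 2 for offset 1 */
  return (a * f0 + b * f1 + 8) >> 4; /* add 8, then divide by 16 */
}

Note that for offset 4 this collapses to (8*a + 8*b + 8) >> 4 == (a + b + 1) >> 1, which is why the code special-cases that phase with the cheaper pavgb path above instead of a filter.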
+static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int offset, int sstep) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (offset << 5))); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(filter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); + sec += sec_stride; + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int y_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, y_offset, src_stride); +} + +static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int x_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, x_offset, 1); +} + +static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int y_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + exp_src_lo = _mm256_unpacklo_epi8(prev_src_avg, src_avg); + exp_src_hi = _mm256_unpackhi_epi8(prev_src_avg, src_avg); + prev_src_avg = src_avg; + + FILTER_SRC(filter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } + 
CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int x_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i src_reg, src_pack; + int i; + exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b); + exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b); + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(filter) + + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg); + sec += sec_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + dst += dst_stride; + src += src_stride; + } +} + +static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int x_offset, int y_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5))); + const __m256i yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i prev_src_pack, src_pack; + int i; + exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b); + exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b); + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + prev_src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src += src_stride; + + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(xfilter) + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + + // merge previous 
pack to current pack source + exp_src_lo = _mm256_unpacklo_epi8(prev_src_pack, src_pack); + exp_src_hi = _mm256_unpackhi_epi8(prev_src_pack, src_pack); + + FILTER_SRC(yfilter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } + + prev_src_pack = src_pack; + + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, + int do_sec, int height, unsigned int *sse) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i sum_reg = _mm256_setzero_si256(); + __m256i sse_reg = _mm256_setzero_si256(); + __m256i sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + int sum; + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + spv32_x0_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 0 and y_offset = 4 + } else if (y_offset == 4) { + spv32_x0_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 0 and y_offset = bilin interpolation + } else { + spv32_x0_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, y_offset); + } + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { + if (y_offset == 0) { + spv32_x4_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { + spv32_x4_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 4 and y_offset = bilin interpolation + } else { + spv32_x4_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, y_offset); + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + spv32_xb_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, x_offset); + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { + spv32_xb_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, x_offset); + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + spv32_xb_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, x_offset, y_offset); + } + } + CALC_SUM_AND_SSE + return sum; +} + +static unsigned int sub_pixel_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, int height, unsigned int *sse) { + return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, + NULL, 0, 0, height, sse); +} + +static unsigned int sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sse) { + return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, + sec, sec_stride, 1, height, sse); +} + typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned 
int *sse, int *sum); -void vpx_get32x32var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, unsigned int *sse, - int *sum); - static void variance_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int w, int h, unsigned int *sse, int *sum, get_var_avx2 var_fn, @@ -44,7 +620,7 @@ unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, int sum; variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, vpx_get16x16var_avx2, 16); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, @@ -60,7 +636,7 @@ unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); } @@ -69,7 +645,7 @@ unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 10); } @@ -78,7 +654,7 @@ unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 12); } @@ -87,32 +663,22 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, - int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, - int height, unsigned int *sse); - -unsigned int vpx_sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sseptr); - unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse) { unsigned int sse1; - const int se1 = vpx_sub_pixel_variance32xh_avx2( + const int se1 = sub_pixel_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); unsigned int sse2; const int se2 = - vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, - dst + 32, dst_stride, 64, &sse2); + sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, + dst + 32, dst_stride, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; return *sse - (uint32_t)(((int64_t)se * se) >> 12); @@ -123,7 +689,7 @@ unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src, int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse) { - const int se = vpx_sub_pixel_variance32xh_avx2( + const int se = sub_pixel_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } @@ -132,10 +698,10 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, 
unsigned int *sse, const uint8_t *sec) { unsigned int sse1; - const int se1 = vpx_sub_pixel_avg_variance32xh_avx2( + const int se1 = sub_pixel_avg_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); unsigned int sse2; - const int se2 = vpx_sub_pixel_avg_variance32xh_avx2( + const int se2 = sub_pixel_avg_variance32xh_avx2( src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, 64, 64, &sse2); const int se = se1 + se2; @@ -149,7 +715,7 @@ unsigned int vpx_sub_pixel_avg_variance32x32_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { // Process 32 elements in parallel. - const int se = vpx_sub_pixel_avg_variance32xh_avx2( + const int se = sub_pixel_avg_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } diff --git a/libvpx/vpx_dsp/x86/variance_impl_avx2.c b/libvpx/vpx_dsp/x86/variance_impl_avx2.c deleted file mode 100644 index 51e6b19ad..000000000 --- a/libvpx/vpx_dsp/x86/variance_impl_avx2.c +++ /dev/null @@ -1,708 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <immintrin.h> // AVX2 - -#include "./vpx_dsp_rtcd.h" -#include "vpx_ports/mem.h" - -/* clang-format off */ -DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, - 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, -}; -/* clang-format on */ - -void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *SSE, int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i, src_2strides, ref_2strides; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing two strides in a 256 bit register reducing the number - // of loop stride by half (comparing to the sse2 code) - src_2strides = source_stride << 1; - ref_2strides = recon_stride << 1; - for (i = 0; i < 8; i++) { - src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr))); - src = _mm256_inserti128_si256( - src, 
_mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1); - - ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr))); - ref = _mm256_inserti128_si256( - ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = - _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - - src_ptr += src_2strides; - ref_ptr += ref_2strides; - } - - { - __m128i sum_res, madd_res; - __m128i expand_sum_low, expand_sum_high, expand_sum; - __m128i expand_madd_low, expand_madd_high, expand_madd; - __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // extract the low lane and add it to the high lane - sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), - _mm256_extractf128_si256(sum_ref_src, 1)); - - madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), - _mm256_extractf128_si256(madd_ref_src, 1)); - - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = - _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - expand_sum_high = - _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - - // shifting the sign 16 bits right - expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); - - expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high); - - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = - _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - expand_madd_high = - _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - - expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); - - ex_expand_sum_low = - _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - ex_expand_sum_high = - _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - - ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - - // shift 8 bytes eight - madd_res = _mm_srli_si128(expand_madd, 8); - sum_res = _mm_srli_si128(ex_expand_sum, 8); - - madd_res = _mm_add_epi32(madd_res, expand_madd); - sum_res = _mm_add_epi32(sum_res, ex_expand_sum); - - *((int *)SSE) = _mm_cvtsi128_si32(madd_res); - - *((int *)Sum) = _mm_cvtsi128_si32(sum_res); - } -} - -void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *SSE, int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing 32 elements in parallel - for (i = 0; i < 16; i++) { - src = _mm256_loadu_si256((__m256i const 
*)(src_ptr)); - - ref = _mm256_loadu_si256((__m256i const *)(ref_ptr)); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = - _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - - src_ptr += source_stride; - ref_ptr += recon_stride; - } - - { - __m256i expand_sum_low, expand_sum_high, expand_sum; - __m256i expand_madd_low, expand_madd_high, expand_madd; - __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); - expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); - - // shifting the sign 16 bits right - expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); - - expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high); - - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); - expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); - - expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high); - - ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); - ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); - - ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - - // shift 8 bytes eight - madd_ref_src = _mm256_srli_si256(expand_madd, 8); - sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); - - madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); - sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); - - // extract the low lane and the high lane and add the results - *((int *)SSE) = - _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); - - *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); - } -} - -#define FILTER_SRC(filter) \ - /* filter the source */ \ - exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ - exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ - \ - /* add 8 to source */ \ - exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ - exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ - \ - /* divide source by 16 */ \ - exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ - exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); - -#define MERGE_WITH_SRC(src_reg, reg) \ - exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ - exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); - -#define LOAD_SRC_DST \ - /* load source and destination */ \ - src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ - dst_reg = _mm256_loadu_si256((__m256i const *)(dst)); - -#define AVG_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = 
_mm256_loadu_si256((__m256i const *)(src + size_stride)); \ - /* average between current and next stride source */ \ - src_reg = _mm256_avg_epu8(src_reg, src_next_reg); - -#define MERGE_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ - MERGE_WITH_SRC(src_reg, src_next_reg) - -#define CALC_SUM_SSE_INSIDE_LOOP \ - /* expand each byte to 2 bytes */ \ - exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ - exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ - /* source - dest */ \ - exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ - exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ - /* caculate sum */ \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ - exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ - exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ - /* calculate sse */ \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); - -// final calculation to sum and sse -#define CALC_SUM_AND_SSE \ - res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ - sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ - sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ - \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ - \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); - -unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, - int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, - int height, unsigned int *sse) { - __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; - __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; - __m256i zero_reg; - int i, sum; - sum_reg = _mm256_set1_epi16(0); - sse_reg = _mm256_set1_epi16(0); - zero_reg = _mm256_set1_epi16(0); - - // x_offset = 0 and y_offset = 0 - if (x_offset == 0) { - if (y_offset == 0) { - for (i = 0; i < height; i++) { - LOAD_SRC_DST - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, src_stride) - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg; - - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, src_stride) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { 
- if (y_offset == 0) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg, src_avg; - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // average between previous average to current average - src_avg = _mm256_avg_epu8(src_avg, src_reg); - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - // save current source average - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = 8 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg, src_avg; - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - MERGE_WITH_SRC(src_avg, src_reg) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - // x_offset = bilin interpolation and y_offset = 0 - } else { - if (y_offset == 0) { - __m256i filter, pw8, src_next_reg; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { - __m256i filter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // average between previous pack to the current - src_pack = _mm256_avg_epu8(src_pack, src_reg); - MERGE_WITH_SRC(src_pack, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src_pack = src_reg; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = bilin interpolation - } else { - __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - xfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - y_offset <<= 5; - yfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - - FILTER_SRC(xfilter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = 
_mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(xfilter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // merge previous pack to current pack source - MERGE_WITH_SRC(src_pack, src_reg) - // filter the source - FILTER_SRC(yfilter) - src_pack = src_reg; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - } - CALC_SUM_AND_SSE - return sum; -} - -unsigned int vpx_sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sse) { - __m256i sec_reg; - __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; - __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; - __m256i zero_reg; - int i, sum; - sum_reg = _mm256_set1_epi16(0); - sse_reg = _mm256_set1_epi16(0); - zero_reg = _mm256_set1_epi16(0); - - // x_offset = 0 and y_offset = 0 - if (x_offset == 0) { - if (y_offset == 0) { - for (i = 0; i < height; i++) { - LOAD_SRC_DST - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } else if (y_offset == 8) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, src_stride) - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg; - - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, src_stride) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { - if (y_offset == 0) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg, src_avg; - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // average between previous average to current average - src_avg = _mm256_avg_epu8(src_avg, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_avg = _mm256_avg_epu8(src_avg, sec_reg); - sec += sec_stride; - // expand each byte 
to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = 8 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg, src_avg; - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - MERGE_WITH_SRC(src_avg, src_reg) - FILTER_SRC(filter) - src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_avg = _mm256_avg_epu8(src_avg, sec_reg); - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - // x_offset = bilin interpolation and y_offset = 0 - } else { - if (y_offset == 0) { - __m256i filter, pw8, src_next_reg; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - MERGE_WITH_SRC(src_reg, zero_reg) - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { - __m256i filter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // average between previous pack to the current - src_pack = _mm256_avg_epu8(src_pack, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_pack = _mm256_avg_epu8(src_pack, sec_reg); - sec += sec_stride; - MERGE_WITH_SRC(src_pack, zero_reg) - src_pack = src_reg; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = bilin interpolation - } else { - __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - xfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - y_offset <<= 5; - yfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - - FILTER_SRC(xfilter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(xfilter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // merge 
previous pack to current pack source - MERGE_WITH_SRC(src_pack, src_reg) - // filter the source - FILTER_SRC(yfilter) - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_pack = _mm256_avg_epu8(src_pack, sec_reg); - MERGE_WITH_SRC(src_pack, zero_reg) - src_pack = src_reg; - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - } - CALC_SUM_AND_SSE - return sum; -} diff --git a/libvpx/vpx_dsp/x86/variance_sse2.c b/libvpx/vpx_dsp/x86/variance_sse2.c index 1161da491..8d8bf183b 100644 --- a/libvpx/vpx_dsp/x86/variance_sse2.c +++ b/libvpx/vpx_dsp/x86/variance_sse2.c @@ -222,7 +222,7 @@ unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride, unsigned int *sse) { int sum; vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride, diff --git a/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/libvpx/vpx_dsp/x86/vpx_asm_stubs.c index 727d9d115..4f164afeb 100644 --- a/libvpx/vpx_dsp/x86/vpx_asm_stubs.c +++ b/libvpx/vpx_dsp/x86/vpx_asm_stubs.c @@ -41,38 +41,38 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; // void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); // void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, 
-// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); FUN_CONV_2D(, sse2); FUN_CONV_2D(avg_, sse2); @@ -140,22 +140,22 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; // const int16_t *filter_y, // int y_step_q4, // int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h, int bd); // void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h, int bd); HIGH_FUN_CONV_2D(, sse2); HIGH_FUN_CONV_2D(avg_, sse2); #endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 diff --git a/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm index 389a692db..3f444e2e6 100644 --- a/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm +++ b/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm @@ -20,14 +20,14 @@ SECTION .text %endif %ifidn %2, highbd %define pavg pavgw -cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ +cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \ dst, dst_stride, \ - fx, fxs, fy, fys, w, h, bd + f, fxo, fxs, fyo, fys, w, h, bd %else %define pavg pavgb -cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ +cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \ dst, dst_stride, \ - fx, fxs, fy, fys, w, h + f, fxo, fxs, fyo, fys, w, h %endif mov r4d, dword wm %ifidn %2, highbd diff --git a/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm index bfc816f23..d83507dc9 100644 --- a/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm +++ b/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm @@ -197,6 +197,8 @@ movdqu [rdi + %2], xmm0 %endm +SECTION .text + ;void vpx_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, diff --git a/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm index 72f2ff71d..9bffe504b 100644 --- a/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm +++ b/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm @@ -171,6 +171,8 @@ %endm %endif +SECTION .text + global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE sym(vpx_highbd_filter_block1d4_v2_sse2): push rbp diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 7c1ecc014..d0919695c 100644 --- 
a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -12,9 +12,10 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_avx2.h" #include "vpx_ports/mem.h" -// filters for 16_h8 and 16_v8 +// filters for 16_h8 DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 @@ -35,493 +36,296 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; -#if defined(__clang__) -#if (__clang_major__ > 0 && __clang_major__ < 3) || \ - (__clang_major__ == 3 && __clang_minor__ <= 3) || \ - (defined(__APPLE__) && defined(__apple_build_version__) && \ - ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ - (__clang_major__ == 5 && __clang_minor__ == 0))) -#define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -#else // clang > 3.3, and not 5.0 on macosx. -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // clang <= 3.3 -#elif defined(__GNUC__) -#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) -#define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 -#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) -#else // gcc > 4.7 -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // gcc <= 4.6 -#else // !(gcc || clang) -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // __clang__ - -static void vpx_filter_block1d16_h8_avx2( +static INLINE void vpx_filter_block1d16_h8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, - ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; - __m256i srcReg32b1, srcReg32b2, filtersReg32; + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, + const int avg) { + __m128i outReg1, outReg2; + __m256i outReg32b1, outReg32b2; unsigned int i; ptrdiff_t src_stride, dst_stride; + __m256i f[4], filt[4], s[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. 
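The setup being deleted here (pack the eight 16-bit taps down to bytes, broadcast them across both 128-bit lanes, then shuffle out four registers each holding one duplicated tap pair) is exactly what the new shuffle_filter_avx2() helper factors out. The point of that layout is that _mm256_maddubs_epi16 can then apply two adjacent taps per 16-bit lane in a single instruction. A scalar sketch of one such lane follows; the function name is illustrative, not part of libvpx, and the real instruction additionally saturates the sum to int16:

#include <stdint.h>

// One lane of _mm256_maddubs_epi16 with a duplicated tap pair (f0, f1):
// unsigned source bytes times signed filter bytes, summed pairwise.
// (The hardware saturates this sum to int16; omitted for clarity.)
static int16_t madd_tap_pair(uint8_t s0, uint8_t s1, int8_t f0, int8_t f1) {
  return (int16_t)(s0 * f0 + s1 * f1);
}

Duplicating each pair across the whole register is what lets the same two taps be applied to all sixteen pixel pairs produced by the filt1..filt4 shuffles.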
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); - - filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + shuffle_filter_avx2(filter, f); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { + __m256i srcReg; + // load the 2 strides of source - srcReg32b1 = + srcReg = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg32b1 = _mm256_inserti128_si256( - srcReg32b1, + srcReg = _mm256_inserti128_si256( + srcReg, _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), 1); // filter the source buffer - srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); - - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + outReg32b1 = convolve8_16_avx2(s, f); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg32b2 = + srcReg = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg32b2 = _mm256_inserti128_si256( - srcReg32b2, + srcReg = _mm256_inserti128_si256( + srcReg, _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), 1); - // add 
and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - // filter the source buffer - srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + outReg32b2 = convolve8_16_avx2(s, f); - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); - - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); - srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg32b1 = _mm256_packus_epi16(outReg32b1, outReg32b2); src_ptr += src_stride; + // average if necessary + outReg1 = _mm256_castsi256_si128(outReg32b1); + outReg2 = _mm256_extractf128_si256(outReg32b1, 1); + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + outReg2 = _mm_avg_epu8( + outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch))); + } + // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, - _mm256_castsi256_si128(srcRegFilt32b1_1)); + _mm_store_si128((__m128i *)output_ptr, outReg1); // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + output_pitch), - _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2); + output_ptr += dst_stride; } // if the number of strides is odd. 
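The single `avg` flag added to the new `_x_` helper is what lets one body serve both the plain and the `avg_` convolve entry points: when set, the freshly filtered bytes are blended with whatever is already in the destination via _mm_avg_epu8, which averages with round-up. The per-byte arithmetic, as a sketch (name illustrative):

#include <stdint.h>

// Per-byte semantics of _mm_avg_epu8(pred, dst): average rounded upward,
// matching the convolve8_avg_* output definition.
static uint8_t avg_round_up(uint8_t pred, uint8_t dst) {
  return (uint8_t)((pred + dst + 1) >> 1);
}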
// process only 16 bytes if (i > 0) { - __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; - __m128i srcRegFilt2, srcRegFilt3; + __m128i srcReg; - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + // load the first 16 bytes of the last row + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = - _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + s[0] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); + s[1] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); + s[2] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); + s[3] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); + outReg1 = convolve8_8_avx2(s, f); // reading the next 16 bytes // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // filter the source buffer - srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = - _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); - - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, 
_mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + s[0] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); + s[1] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); + s[2] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); + s[3] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); + outReg2 = convolve8_8_avx2(s, f); + + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg1 = _mm_packus_epi16(outReg1, outReg2); + + // average if necessary + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + } // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); + _mm_store_si128((__m128i *)output_ptr, outReg1); } } -static void vpx_filter_block1d16_v8_avx2( +static void vpx_filter_block1d16_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr, + ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) { + vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride, + output_height, filter, 0); +} + +static void vpx_filter_block1d16_h8_avg_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr, + ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) { + vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride, + output_height, filter, 1); +} + +static INLINE void vpx_filter_block1d16_v8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, - ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64; - __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; - __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; - __m256i srcReg32b11, srcReg32b12, filtersReg32; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, + const int avg) { + __m128i outReg1, outReg2; + __m256i srcRegHead1; unsigned int i; ptrdiff_t src_stride, dst_stride; + __m256i f[4], s1[4], s2[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the - // same data in both lanes of 128 bit register. 
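As in the horizontal path, the long maddubs/adds/min-max chain being removed from the vertical filter computes an 8-tap convolution with round-to-nearest by 64, a shift by FILTER_BITS (7), and saturation to 8 bits on the final pack. A plain-C reference for one output pixel, as a sketch of the intent only (it ignores the intermediate 16-bit saturation ordering that the min/max pairing in the SIMD code exists to control):

#include <stdint.h>

// One 8-tap filtered pixel: round by 64, shift by FILTER_BITS (7), then
// clamp to [0, 255] as _mm_packus_epi16 does on the final pack.
static uint8_t convolve8_pixel(const uint8_t *src, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k] * filter[k];
  sum = (sum + 64) >> 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}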
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + shuffle_filter_avx2(filter, f); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; - // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr))); - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); - srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); - srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); - srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); - srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); - srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); - - // have each consecutive loads on the same 256 register - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm256_castsi256_si128(srcReg32b2), 1); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm256_castsi256_si128(srcReg32b3), 1); - srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, - _mm256_castsi256_si128(srcReg32b4), 1); - srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, - _mm256_castsi256_si128(srcReg32b5), 1); - srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, - _mm256_castsi256_si128(srcReg32b6), 1); - srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, - _mm256_castsi256_si128(srcReg32b7), 1); - - // merge every two consecutive registers except the last one - srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); - srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); - - // save - srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); - - // save - srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + { + __m128i s[6]; + __m256i s32b[6]; + + // load 16 bytes 7 times in stride of src_pitch + s[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_pitch)); + srcRegHead1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_pitch))); + + // have each consecutive loads on the same 256 register + s32b[0] = 
_mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + s32b[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + s32b[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + s32b[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + s32b[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + s32b[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), + _mm256_castsi256_si128(srcRegHead1), 1); + + // merge every two consecutive registers except the last one + // the first lanes contain values for filtering odd rows (1,3,5...) and + // the second lanes contain values for filtering even rows (2,4,6...) + s1[0] = _mm256_unpacklo_epi8(s32b[0], s32b[1]); + s2[0] = _mm256_unpackhi_epi8(s32b[0], s32b[1]); + s1[1] = _mm256_unpacklo_epi8(s32b[2], s32b[3]); + s2[1] = _mm256_unpackhi_epi8(s32b[2], s32b[3]); + s1[2] = _mm256_unpacklo_epi8(s32b[4], s32b[5]); + s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]); + } for (i = output_height; i > 1; i -= 2) { - // load the last 2 loads of 16 bytes and have every two + __m256i srcRegHead2, srcRegHead3; + + // load the next 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register - srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); - srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, - _mm256_castsi256_si128(srcReg32b8), 1); - srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); - srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, - _mm256_castsi256_si128(srcReg32b9), 1); - - // merge every two consecutive registers - // save - srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); - srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); - - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); - - // add and saturate the results together - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the 
first - // convolve result and the second lane contain the second convolve - // result - srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + srcRegHead2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_pitch))); + srcRegHead1 = _mm256_inserti128_si256( + srcRegHead1, _mm256_castsi256_si128(srcRegHead2), 1); + srcRegHead3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_pitch))); + srcRegHead2 = _mm256_inserti128_si256( + srcRegHead2, _mm256_castsi256_si128(srcRegHead3), 1); + + // merge the two new consecutive registers + // the first lane contain values for filtering odd rows (1,3,5...) and + // the second lane contain values for filtering even rows (2,4,6...) + s1[3] = _mm256_unpacklo_epi8(srcRegHead1, srcRegHead2); + s2[3] = _mm256_unpackhi_epi8(srcRegHead1, srcRegHead2); + + s1[0] = convolve8_16_avx2(s1, f); + s2[0] = convolve8_16_avx2(s2, f); + + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + s1[0] = _mm256_packus_epi16(s1[0], s2[0]); src_ptr += src_stride; + // average if necessary + outReg1 = _mm256_castsi256_si128(s1[0]); + outReg2 = _mm256_extractf128_si256(s1[0], 1); + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + outReg2 = _mm_avg_epu8( + outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch))); + } + // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1)); + _mm_store_si128((__m128i *)output_ptr, outReg1); // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + out_pitch), - _mm256_extractf128_si256(srcReg32b1, 1)); + _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2); output_ptr += dst_stride; - // save part of the registers for next strides - srcReg32b10 = srcReg32b11; - srcReg32b1 = srcReg32b3; - srcReg32b11 = srcReg32b2; - srcReg32b3 = srcReg32b5; - srcReg32b2 = srcReg32b4; - srcReg32b5 = srcReg32b7; - srcReg32b7 = srcReg32b9; + // shift down by two rows + s1[0] = s1[1]; + s2[0] = s2[1]; + s1[1] = s1[2]; + s2[1] = s2[2]; + s1[2] = s1[3]; + s2[2] = s2[3]; + srcRegHead1 = srcRegHead3; } + + // if the number of strides is odd. 
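The register rotation just above ("shift down by two rows") keeps the six already-unpacked row pairs alive so each iteration loads only two fresh rows: a software delay line over the 8-tap vertical window. The same access pattern in scalar form for a single pixel column, under the usual (sum + 64) >> 7 rounding (a sketch; the function name is hypothetical):

#include <stddef.h>
#include <stdint.h>

// Vertical 8-tap over one pixel column: output row r reads source rows
// r..r+7, so consecutive outputs share seven rows -- the data the AVX2
// loop carries across iterations instead of reloading.
static void filter_column(const uint8_t *src, ptrdiff_t src_pitch,
                          uint8_t *dst, ptrdiff_t dst_pitch, int height,
                          const int16_t *filter) {
  int r, k;
  for (r = 0; r < height; ++r) {
    int sum = 0;
    for (k = 0; k < 8; ++k) sum += src[(r + k) * src_pitch] * filter[k];
    sum = (sum + 64) >> 7;
    dst[r * dst_pitch] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
  }
}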
+ // process only 16 bytes if (i > 0) { - __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; - __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; // load the last 16 bytes - srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + const __m128i srcRegHead2 = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together - srcRegFilt4 = - _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - srcRegFilt7 = - _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt4 = - _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); - srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt7 = - _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), - _mm256_castsi256_si128(secondFilters)); - srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), - _mm256_castsi256_si128(secondFilters)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), - _mm256_castsi256_si128(thirdFilters)); - srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); - - // add and saturate the results together - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + s1[0] = _mm256_castsi128_si256( + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); + s2[0] = _mm256_castsi128_si256( + _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); + + outReg1 = convolve8_8_avx2(s1, f); + outReg2 = convolve8_8_avx2(s2, f); + + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg1 = _mm_packus_epi16(outReg1, outReg2); + + // average if necessary + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + } // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, srcRegFilt1); + _mm_store_si128((__m128i *)output_ptr, outReg1); } } +static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t 
*dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *filter) { + vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, filter, 0); +} + +static void vpx_filter_block1d16_v8_avg_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) { + vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, filter, 1); +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if ARCH_X86_64 @@ -539,6 +343,14 @@ filter8_1dfunction vpx_filter_block1d4_h8_ssse3; #define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // ARCH_X86_64 +filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; +#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3 +#define vpx_filter_block1d8_h8_avg_avx2 vpx_filter_block1d8_h8_avg_ssse3 +#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3 +#define vpx_filter_block1d4_h8_avg_avx2 vpx_filter_block1d4_h8_avg_ssse3 filter8_1dfunction vpx_filter_block1d16_v2_ssse3; filter8_1dfunction vpx_filter_block1d16_h2_ssse3; filter8_1dfunction vpx_filter_block1d8_v2_ssse3; @@ -552,23 +364,53 @@ filter8_1dfunction vpx_filter_block1d4_h2_ssse3; #define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 #define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 #define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 +filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; +#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3 +#define vpx_filter_block1d16_h2_avg_avx2 vpx_filter_block1d16_h2_avg_ssse3 +#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3 +#define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3 +#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3 +#define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +// void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t 
*dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2); // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); +// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); FUN_CONV_2D(, avx2); +FUN_CONV_2D(avg_, avx2); #endif // HAVE_AVX2 && HAVE_SSSE3 diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 09c75d455..e4f992780 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -8,52 +8,37 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <tmmintrin.h> +#include <tmmintrin.h> // SSSE3 + +#include <string.h> #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_filter.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_ssse3.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" - -// filters only for the 4_h8 convolution -DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -// filters for 8_h8 and 16_h8 -DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; // These are reused by the avx2 intrinsics.
-filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; +// vpx_filter_block1d8_v8_intrin_ssse3() +// vpx_filter_block1d8_h8_intrin_ssse3() +// vpx_filter_block1d4_h8_intrin_ssse3() + +static INLINE __m128i shuffle_filter_convolve8_8_ssse3( + const __m128i *const s, const int16_t *const filter) { + __m128i f[4]; + shuffle_filter_ssse3(filter, f); + return convolve8_8_ssse3(s, f); +} void vpx_filter_block1d4_h8_intrin_ssse3( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, shuffle1, shuffle2; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, srcReg, minReg; + __m128i srcRegFilt1, srcRegFilt2; + __m128i addFilterReg64, filtersReg, srcReg; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 @@ -75,8 +60,8 @@ void vpx_filter_block1d4_h8_intrin_ssse3( secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); // loading the local filters - shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8); - shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); + shuffle1 = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6); + shuffle2 = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10); for (i = 0; i < output_height; i++) { srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); @@ -89,25 +74,23 @@ void vpx_filter_block1d4_h8_intrin_ssse3( srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - // extract the higher half of the lane - srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); - srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + // sum the results together, saturating only on the final step + // the specific order of the additions prevents outranges + srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2); - minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + // extract the higher half of the register + srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8); - // add and saturate all the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + // add the rounding offset early to avoid another saturated add + srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); // shift by 7 bit each 16 bits srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - src_ptr += src_pixels_per_line; + src_ptr += src_pitch; // save only 4 bytes *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1); @@ -117,77 +100,35 @@ void vpx_filter_block1d4_h8_intrin_ssse3( } void vpx_filter_block1d8_h8_intrin_ssse3( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; - __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m128i 
srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, minReg; unsigned int i; + __m128i f[4], filt[4], s[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 128 bit register - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 128 bit register - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 128 bit register - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 128 bit register - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - filt1Reg = _mm_load_si128((__m128i const *)filt1_global); - filt2Reg = _mm_load_si128((__m128i const *)filt2_global); - filt3Reg = _mm_load_si128((__m128i const *)filt3_global); - filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + shuffle_filter_ssse3(filter, f); + filt[0] = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + filt[1] = _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + filt[2] = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); + filt[3] = + _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); - srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); - srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); - - // add and saturate all the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + s[0] = _mm_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm_shuffle_epi8(srcReg, filt[3]); + s[0] = convolve8_8_ssse3(s, f); // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + s[0] = _mm_packus_epi16(s[0], s[0]); - src_ptr += src_pixels_per_line; + src_ptr += src_pitch; // save only 8 bytes - _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); + _mm_storel_epi64((__m128i 
*)&output_ptr[0], s[0]); output_ptr += output_pitch; } @@ -196,83 +137,49 @@ void vpx_filter_block1d8_h8_intrin_ssse3( void vpx_filter_block1d8_v8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i addFilterReg64, filtersReg, minReg; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; - __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; - __m128i srcReg8; unsigned int i; + __m128i f[4], s[8], ss[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits in the filter - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits in the filter - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits in the filter - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + shuffle_filter_ssse3(filter, f); // load the first 7 rows of 8 bytes - srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); - srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); for (i = 0; i < output_height; i++) { // load the last 8 bytes - srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); - srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); - srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); - - // add and saturate the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); - srcRegFilt2 = 
_mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + ss[0] = convolve8_8_ssse3(ss, f); // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + ss[0] = _mm_packus_epi16(ss[0], ss[0]); src_ptr += src_pitch; // shift down a row - srcReg1 = srcReg2; - srcReg2 = srcReg3; - srcReg3 = srcReg4; - srcReg4 = srcReg5; - srcReg5 = srcReg6; - srcReg6 = srcReg7; - srcReg7 = srcReg8; + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + s[3] = s[4]; + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; // save only 8 bytes convolve result - _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); + _mm_storel_epi64((__m128i *)&output_ptr[0], ss[0]); output_ptr += out_pitch; } @@ -306,149 +213,69 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); // void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); - -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ - const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ - const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ - const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ - \ - const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ - const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ - const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ - const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ - \ - out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ - out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ - out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ - out3 = 
_mm_unpackhi_epi64(tr2_1, tr2_1); \ - out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ - out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ - out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ - out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ - } - -static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *x_filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 - const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); - // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 - const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 - const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); - // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 - const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); - const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); - const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); - const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, 
ssse3); + +static void filter_horiz_w8_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const x_filter) { + __m128i s[8], ss[4], temp; + + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, ss); + temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter); // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 8 bytes convolve result _mm_storel_epi64((__m128i *)dst, temp); } -static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A, B, C, D, E, F, G, H; - - A = _mm_loadl_epi64((const __m128i *)src); - B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); - C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); - E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); - F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); - G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); - H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); - - TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H); - - _mm_storel_epi64((__m128i *)dst, A); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H); +static void transpose8x8_to_dst(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride) { + __m128i s[8]; + + load_8bit_8x8(src, src_stride, s); + transpose_8bit_8x8(s, s); + store_8bit_8x8(s, dst, dst_stride); } -static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { +static void scaledconvolve_horiz_w8(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, + const int w, const int h) { DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; - // This function processes 8x8 areas. The intermediate height is not always + // This function processes 8x8 areas. The intermediate height is not always // a multiple of 8, so force it to be a multiple of 8 here. 
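// Worked example (illustration only): h = 13 gives y = 13 + (8 - 5) = 16; a
// height that is already a multiple of 8 still rounds up by a full 8
// (h = 16 -> y = 24), and the 8 tail rows reserved in the intermediate
// buffer absorb those extra writes.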
y = h + (8 - (h & 0x7)); @@ -479,93 +306,50 @@ static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, } while (y -= 8); } -static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - // TRANSPOSE... - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // - // TO - // - // 00 10 20 30 - // 01 11 21 31 - // 02 12 22 32 - // 03 13 23 33 - // 04 14 24 34 - // 05 15 25 35 - // 06 16 26 36 - // 07 17 27 37 - // - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); +static void filter_horiz_w4_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const filter) { + __m128i s[4], ss[2]; + __m128i temp; + + load_8bit_8x4(src, src_stride, s); + transpose_16bit_4x4(s, ss); + // 00 01 10 11 20 21 30 31 + s[0] = ss[0]; // 02 03 12 13 22 23 32 33 - const __m128i s3s2 = _mm_srli_si128(s1s0, 8); + s[1] = _mm_srli_si128(ss[0], 8); + // 04 05 14 15 24 25 34 35 + s[2] = ss[1]; // 06 07 16 17 26 27 36 37 - const __m128i s7s6 = _mm_srli_si128(s5s4, 8); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); + s[3] = _mm_srli_si128(ss[1], 8); + + temp = shuffle_filter_convolve8_8_ssse3(s, filter); // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 4 bytes *(int *)dst = _mm_cvtsi128_si32(temp); } -static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A = _mm_cvtsi32_si128(*(const int *)src); - __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); - __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); - __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); - // 00 10 01 11 02 12 03 13 - const __m128i tr0_0 = 
_mm_unpacklo_epi8(A, B); - // 20 30 21 31 22 32 23 33 - const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - A = _mm_unpacklo_epi16(tr0_0, tr0_1); - B = _mm_srli_si128(A, 4); - C = _mm_srli_si128(A, 8); - D = _mm_srli_si128(A, 12); - - *(int *)(dst) = _mm_cvtsi128_si32(A); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); - *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); - *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); +static void transpose4x4_to_dst(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride) { + __m128i s[4]; + + load_8bit_4x4(src, src_stride, s); + s[0] = transpose_8bit_4x4(s); + s[1] = _mm_srli_si128(s[0], 4); + s[2] = _mm_srli_si128(s[0], 8); + s[3] = _mm_srli_si128(s[0], 12); + store_8bit_4x4(s, dst, dst_stride); } -static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { +static void scaledconvolve_horiz_w4(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, + const int w, const int h) { DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; @@ -597,50 +381,41 @@ static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); - const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); - const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); +static __m128i filter_vert_kernel(const __m128i 
*const s, + const int16_t *const filter) { + __m128i ss[4]; + __m128i temp; + + // 00 10 01 11 02 12 03 13 + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + // 20 30 21 31 22 32 23 33 + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + // 40 50 41 51 42 52 43 53 + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + // 60 70 61 71 62 72 63 73 + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + + temp = shuffle_filter_convolve8_8_ssse3(ss, filter); // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); + return _mm_packus_epi16(temp, temp); +} + +static void filter_vert_w4_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const int16_t *const filter) { + __m128i s[8]; + __m128i temp; + + load_8bit_4x8(src, src_stride, s); + temp = filter_vert_kernel(s, filter); // save only 4 bytes *(int *)dst = _mm_cvtsi128_si32(temp); } -static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -659,50 +434,21 @@ static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); +static void filter_vert_w8_ssse3(const uint8_t 
*const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const int16_t *const filter) { + __m128i s[8], temp; + + load_8bit_8x8(src, src_stride, s); + temp = filter_vert_kernel(s, filter); // save only 8 bytes convolve result _mm_storel_epi64((__m128i *)dst, temp); } -static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -719,81 +465,44 @@ static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter, int w) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); +static void filter_vert_w16_ssse3(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const filter, const int w) { int i; + __m128i f[4]; + shuffle_filter_ssse3(filter, f); for (i = 0; i < w; i += 16) { - const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); - const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - // merge the result together - const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); - const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); - const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); - const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); - const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); - const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); - const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); - // add and saturate the results together - const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); - const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); - // merge the result together - const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); - const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); - const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); + __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi; + + loadu_8bit_16x8(src, src_stride, s); + // merge the result together - const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); - const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); - // 
multiply 2 adjacent elements with the filter and add the result - const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); - const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); - // add and saturate the results together - __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); - __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); - - // add and saturate the results together - temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); - temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); - // round and shift by 7 bit each 16 bit - temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); - temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result + s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]); + s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]); + s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]); + s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]); + s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]); + s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]); + s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]); + s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]); + temp_lo = convolve8_8_ssse3(s_lo, f); + temp_hi = convolve8_8_ssse3(s_hi, f); + + // shrink to 8 bit each 16 bits, the first lane contain the first convolve + // result and the second lane contain the second convolve result temp_hi = _mm_packus_epi16(temp_lo, temp_hi); - src_ptr += 16; + src += 16; // save 16 bytes convolve result _mm_store_si128((__m128i *)&dst[i], temp_hi); } } -static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -811,11 +520,10 @@ static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, } } -static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h) { +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -829,60 +537,49 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. 
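// Worked check of the relaxed bound (illustrative arithmetic): with
// y_step_q4 = 64, h <= 32 and a subpel offset y0_q4 of at most 15,
// intermediate_height is at most ((32 - 1) * 64 + 15) >> 4 + 8 = 132 rows,
// which still fits in the 135 + 8 rows reserved below.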
DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= 64); assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); if (w >= 8) { scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); } else { scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); } if (w >= 16) { scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } else if (w == 8) { scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } else { scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } } -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, filters_y, y0_q4, y_step_q4, w, h); -} - // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); FUN_CONV_2D(, ssse3); FUN_CONV_2D(avg_, ssse3); diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm index 08f3d6a6c..8497e1721 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm @@ -176,6 +176,8 @@ movq [rdi + %2], xmm0 %endm +SECTION .text + ;void vpx_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm index c1a6f23ab..952d9307d 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm @@ -327,12 +327,12 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ %endm INIT_XMM ssse3 -SUBPIX_HFILTER16 h8 -SUBPIX_HFILTER16 h8_avg -SUBPIX_HFILTER8 h8 -SUBPIX_HFILTER8 h8_avg -SUBPIX_HFILTER4 h8 -SUBPIX_HFILTER4 h8_avg +SUBPIX_HFILTER16 h8 ; vpx_filter_block1d16_h8_ssse3 +SUBPIX_HFILTER16 h8_avg ; vpx_filter_block1d16_h8_avg_ssse3 
+SUBPIX_HFILTER8 h8 ; vpx_filter_block1d8_h8_ssse3 +SUBPIX_HFILTER8 h8_avg ; vpx_filter_block1d8_h8_avg_ssse3 +SUBPIX_HFILTER4 h8 ; vpx_filter_block1d4_h8_ssse3 +SUBPIX_HFILTER4 h8_avg ; vpx_filter_block1d4_h8_avg_ssse3 ;------------------------------------------------------------------------------- @@ -795,9 +795,9 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ %endm INIT_XMM ssse3 -SUBPIX_VFILTER16 v8 -SUBPIX_VFILTER16 v8_avg -SUBPIX_VFILTER v8, 8 -SUBPIX_VFILTER v8_avg, 8 -SUBPIX_VFILTER v8, 4 -SUBPIX_VFILTER v8_avg, 4 +SUBPIX_VFILTER16 v8 ; vpx_filter_block1d16_v8_ssse3 +SUBPIX_VFILTER16 v8_avg ; vpx_filter_block1d16_v8_avg_ssse3 +SUBPIX_VFILTER v8, 8 ; vpx_filter_block1d8_v8_ssse3 +SUBPIX_VFILTER v8_avg, 8 ; vpx_filter_block1d8_v8_avg_ssse3 +SUBPIX_VFILTER v8, 4 ; vpx_filter_block1d4_v8_ssse3 +SUBPIX_VFILTER v8_avg, 4 ; vpx_filter_block1d4_v8_avg_ssse3 diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm index a378dd040..6d79492e4 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm @@ -131,6 +131,8 @@ dec rcx %endm +SECTION .text + global sym(vpx_filter_block1d4_v2_sse2) PRIVATE sym(vpx_filter_block1d4_v2_sse2): push rbp diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm index 538b2129d..8c9c817be 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm @@ -105,6 +105,8 @@ dec rcx %endm +SECTION .text + global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE sym(vpx_filter_block1d4_v2_ssse3): push rbp diff --git a/libvpx/vpx_mem/vpx_mem.c b/libvpx/vpx_mem/vpx_mem.c index a9be08680..eeba34c37 100644 --- a/libvpx/vpx_mem/vpx_mem.c +++ b/libvpx/vpx_mem/vpx_mem.c @@ -82,12 +82,3 @@ void vpx_free(void *memblk) { free(addr); } } - -#if CONFIG_VP9_HIGHBITDEPTH -void *vpx_memset16(void *dest, int val, size_t length) { - size_t i; - uint16_t *dest16 = (uint16_t *)dest; - for (i = 0; i < length; i++) *dest16++ = val; - return dest; -} -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_mem/vpx_mem.h b/libvpx/vpx_mem/vpx_mem.h index 733aff488..a4274b885 100644 --- a/libvpx/vpx_mem/vpx_mem.h +++ b/libvpx/vpx_mem/vpx_mem.h @@ -19,6 +19,8 @@ #include <stdlib.h> #include <stddef.h> +#include "vpx/vpx_integer.h" + #if defined(__cplusplus) extern "C" { #endif @@ -29,7 +31,12 @@ void *vpx_calloc(size_t num, size_t size); void vpx_free(void *memblk); #if CONFIG_VP9_HIGHBITDEPTH -void *vpx_memset16(void *dest, int val, size_t length); +static INLINE void *vpx_memset16(void *dest, int val, size_t length) { + size_t i; + uint16_t *dest16 = (uint16_t *)dest; + for (i = 0; i < length; i++) *dest16++ = val; + return dest; +} #endif #include <string.h> diff --git a/libvpx/vpx_ports/asmdefs_mmi.h b/libvpx/vpx_ports/asmdefs_mmi.h new file mode 100644 index 000000000..a9a49745a --- /dev/null +++ b/libvpx/vpx_ports/asmdefs_mmi.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_PORTS_ASMDEFS_MMI_H_ +#define VPX_PORTS_ASMDEFS_MMI_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +#if HAVE_MMI + +#if HAVE_MIPS64 +#define mips_reg int64_t +#define MMI_ADDU(reg1, reg2, reg3) \ + "daddu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_ADDIU(reg1, reg2, immediate) \ + "daddiu " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_ADDI(reg1, reg2, immediate) \ + "daddi " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_SUBU(reg1, reg2, reg3) \ + "dsubu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_L(reg, addr, bias) \ + "ld " #reg ", " #bias "(" #addr ") \n\t" + +#define MMI_SRL(reg1, reg2, shift) \ + "dsrl " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_SLL(reg1, reg2, shift) \ + "dsll " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_MTC1(reg, fp) \ + "dmtc1 " #reg ", " #fp " \n\t" + +#define MMI_LI(reg, immediate) \ + "dli " #reg ", " #immediate " \n\t" + +#else +#define mips_reg int32_t +#define MMI_ADDU(reg1, reg2, reg3) \ + "addu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_ADDIU(reg1, reg2, immediate) \ + "addiu " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_ADDI(reg1, reg2, immediate) \ + "addi " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_SUBU(reg1, reg2, reg3) \ + "subu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_L(reg, addr, bias) \ + "lw " #reg ", " #bias "(" #addr ") \n\t" + +#define MMI_SRL(reg1, reg2, shift) \ + "srl " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_SLL(reg1, reg2, shift) \ + "sll " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_MTC1(reg, fp) \ + "mtc1 " #reg ", " #fp " \n\t" + +#define MMI_LI(reg, immediate) \ + "li " #reg ", " #immediate " \n\t" + +#endif /* HAVE_MIPS64 */ + +#endif /* HAVE_MMI */ + +#endif /* VPX_PORTS_ASMDEFS_MMI_H_ */ diff --git a/libvpx/vpx_ports/vpx_ports.mk b/libvpx/vpx_ports/vpx_ports.mk index fc0a783b7..e17145e6c 100644 --- a/libvpx/vpx_ports/vpx_ports.mk +++ b/libvpx/vpx_ports/vpx_ports.mk @@ -28,3 +28,7 @@ PORTS_SRCS-$(ARCH_ARM) += arm.h PORTS_SRCS-$(ARCH_PPC) += ppc_cpudetect.c PORTS_SRCS-$(ARCH_PPC) += ppc.h + +ifeq ($(ARCH_MIPS), yes) +PORTS_SRCS-yes += asmdefs_mmi.h +endif diff --git a/libvpx/vpx_ports/x86.h b/libvpx/vpx_ports/x86.h index 5aabb9e3a..ced65ac05 100644 --- a/libvpx/vpx_ports/x86.h +++ b/libvpx/vpx_ports/x86.h @@ -151,16 +151,17 @@ static INLINE uint64_t xgetbv(void) { #endif #endif -#define HAS_MMX 0x01 -#define HAS_SSE 0x02 -#define HAS_SSE2 0x04 -#define HAS_SSE3 0x08 -#define HAS_SSSE3 0x10 -#define HAS_SSE4_1 0x20 -#define HAS_AVX 0x40 -#define HAS_AVX2 0x80 +#define HAS_MMX 0x001 +#define HAS_SSE 0x002 +#define HAS_SSE2 0x004 +#define HAS_SSE3 0x008 +#define HAS_SSSE3 0x010 +#define HAS_SSE4_1 0x020 +#define HAS_AVX 0x040 +#define HAS_AVX2 0x080 +#define HAS_AVX512 0x100 #ifndef BIT -#define BIT(n) (1 << n) +#define BIT(n) (1u << n) #endif static INLINE int x86_simd_caps(void) { @@ -209,6 +210,12 @@ static INLINE int x86_simd_caps(void) { cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); if (reg_ebx & BIT(5)) flags |= HAS_AVX2; + + // bits 16 (AVX-512F) & 17 (AVX-512DQ) & 28 (AVX-512CD) & + // 30 (AVX-512BW) & 31 (AVX-512VL) + if ((reg_ebx & (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))) == + (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))) + flags |= HAS_AVX512; } } } diff --git a/libvpx/vpx_scale/generic/yv12config.c b/libvpx/vpx_scale/generic/yv12config.c index a674eac84..9c7ca42c7 100644 --- a/libvpx/vpx_scale/generic/yv12config.c +++
b/libvpx/vpx_scale/generic/yv12config.c @@ -9,6 +9,7 @@ */ #include <assert.h> +#include <limits.h> #include "vpx_scale/yv12config.h" #include "vpx_mem/vpx_mem.h" @@ -165,6 +166,12 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, uint8_t *buf = NULL; + // frame_size is stored in buffer_alloc_sz, which is an int. If it won't + // fit, fail early. + if (frame_size > INT_MAX) { + return -1; + } + if (cb != NULL) { const int align_addr_extra_size = 31; const uint64_t external_frame_size = frame_size + align_addr_extra_size; @@ -193,8 +200,6 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, vpx_free(ybf->buffer_alloc); ybf->buffer_alloc = NULL; - if (frame_size != (size_t)frame_size) return -1; - ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, (size_t)frame_size); if (!ybf->buffer_alloc) return -1; diff --git a/libvpx/vpx_scale/generic/yv12extend.c b/libvpx/vpx_scale/generic/yv12extend.c index a6aaff95a..e23180650 100644 --- a/libvpx/vpx_scale/generic/yv12extend.c +++ b/libvpx/vpx_scale/generic/yv12extend.c @@ -111,25 +111,6 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { assert(ybf->y_height - ybf->y_crop_height >= 0); assert(ybf->y_width - ybf->y_crop_width >= 0); -#if CONFIG_VP9_HIGHBITDEPTH - if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { - extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, - ybf->y_crop_height, ybf->border, ybf->border, - ybf->border + ybf->y_height - ybf->y_crop_height, - ybf->border + ybf->y_width - ybf->y_crop_width); - - extend_plane_high(ybf->u_buffer, ybf->uv_stride, ybf->uv_crop_width, - ybf->uv_crop_height, uv_border, uv_border, - uv_border + ybf->uv_height - ybf->uv_crop_height, - uv_border + ybf->uv_width - ybf->uv_crop_width); - - extend_plane_high(ybf->v_buffer, ybf->uv_stride, ybf->uv_crop_width, - ybf->uv_crop_height, uv_border, uv_border, - uv_border + ybf->uv_height - ybf->uv_crop_height, - uv_border + ybf->uv_width - ybf->uv_crop_width); - return; - } -#endif extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, ybf->border, ybf->border, ybf->border + ybf->y_height - ybf->y_crop_height, @@ -208,6 +189,7 @@ static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { // Copies the source image into the destination image and updates the // destination's UMV borders. // Note: The frames are assumed to be identical in size. 
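// (Each plane below is copied row by row: memcpy moves only the visible
// width, while the source and destination pointers advance by their own
// strides, so the two frames may carry different padding as long as their
// visible sizes match.)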
+ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) { int row; @@ -222,6 +204,48 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, assert(src_ybc->y_height == dst_ybc->y_height); #endif + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } + + src = src_ybc->u_buffer; + dst = dst_ybc->u_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + src = src_ybc->v_buffer; + dst = dst_ybc->v_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + vp8_yv12_extend_frame_borders_c(dst_ybc); +} + +#if CONFIG_VP9 +void vpx_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; + +#if 0 + /* These assertions are valid in the codec, but the libvpx-tester uses + * this code slightly differently. + */ + assert(src_ybc->y_width == dst_ybc->y_width); + assert(src_ybc->y_height == dst_ybc->y_height); +#endif + #if CONFIG_VP9_HIGHBITDEPTH if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { assert(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH); @@ -249,7 +273,7 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, dst += dst_ybc->uv_stride; } - vp8_yv12_extend_frame_borders_c(dst_ybc); + vpx_extend_frame_borders_c(dst_ybc); return; } else { assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH)); @@ -280,8 +304,9 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, dst += dst_ybc->uv_stride; } - vp8_yv12_extend_frame_borders_c(dst_ybc); + vpx_extend_frame_borders_c(dst_ybc); } +#endif // CONFIG_VP9 void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) { diff --git a/libvpx/vpx_scale/vpx_scale_rtcd.pl b/libvpx/vpx_scale/vpx_scale_rtcd.pl index 44b115c7e..1281071a7 100644 --- a/libvpx/vpx_scale/vpx_scale_rtcd.pl +++ b/libvpx/vpx_scale/vpx_scale_rtcd.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vpx_scale_forward_decls() { print <<EOF struct yv12_buffer_config; @@ -23,6 +33,8 @@ add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_yb add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"; if (vpx_config("CONFIG_VP9") eq "yes") { + add_proto qw/void vpx_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"; + add_proto qw/void vpx_extend_frame_borders/, "struct yv12_buffer_config *ybf"; specialize qw/vpx_extend_frame_borders dspr2/; diff --git a/libvpx/vpx_util/vpx_atomics.h b/libvpx/vpx_util/vpx_atomics.h new file mode 100644 index 000000000..b8cf80dae --- /dev/null +++ b/libvpx/vpx_util/vpx_atomics.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_UTIL_VPX_ATOMICS_H_ +#define VPX_UTIL_VPX_ATOMICS_H_ + +#include "./vpx_config.h" + +#ifdef __cplusplus extern "C" { +#endif // __cplusplus + +#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD + +// Look for built-in atomic support. We cannot use <stdatomic.h> or <atomic> +// since neither is guaranteed to exist on both C and C++ platforms, and we need +// to back the atomic type with the same type (g++ needs to be able to use +// gcc-built code). g++ 6 doesn't support _Atomic as a keyword and can't use the +// stdatomic.h header. Even if both <stdatomic.h> and <atomic> existed, it's not +// guaranteed that atomic_int is the same type as std::atomic_int. +// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60932#c13. +#if !defined(__has_builtin) +#define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#endif // !defined(__has_builtin) + +#if (__has_builtin(__atomic_load_n)) || \ + (defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))) +// For GCC >= 4.7 and Clang versions that support __atomic builtins, use those. +#define VPX_USE_ATOMIC_BUILTINS +#else +// Use platform-specific asm barriers. +#if defined(_MSC_VER) +// TODO(pbos): This assumes that newer versions of MSVC are building with the +// default /volatile:ms (or older, where this is always true). Consider adding +// support for using <atomic> instead of stdatomic.h when building C++11 under +// MSVC. It's unclear what to do for plain C under /volatile:iso (inline asm?); +// there are no explicit Interlocked* functions for only storing or loading +// (presumably because volatile has historically implied that on MSVC). +// +// For earlier versions of MSVC, or under the default /volatile:ms, volatile int +// accesses are acquire/release and require no barrier. +#define vpx_atomic_memory_barrier() \ + do { \ + } while (0) +#else +#if ARCH_X86 || ARCH_X86_64 +// Use a compiler barrier on x86, no runtime penalty. +#define vpx_atomic_memory_barrier() __asm__ __volatile__("" ::: "memory") +#elif ARCH_ARM +#define vpx_atomic_memory_barrier() __asm__ __volatile__("dmb ish" ::: "memory") +#elif ARCH_MIPS +#define vpx_atomic_memory_barrier() __asm__ __volatile__("sync" ::: "memory") +#else +#error Unsupported architecture! +#endif // ARCH_X86 || ARCH_X86_64 +#endif // defined(_MSC_VER) +#endif // atomic builtin availability check + +// These are wrapped in a struct so that they are not easily accessed directly +// on any platform (to discourage programmer errors by setting values directly). +// This primitive MUST be initialized using vpx_atomic_init or VPX_ATOMIC_INIT +// (NOT memset) and accessed through vpx_atomic_ functions. +typedef struct vpx_atomic_int { volatile int value; } vpx_atomic_int; + +#define VPX_ATOMIC_INIT(num) \ + { num } + +// Initialization of an atomic int, not thread safe.
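+//
+// [Editorial sketch: not part of the original patch] The intended usage is a
+// release store in the publishing thread paired with an acquire load in the
+// observing thread, e.g. a hypothetical row-progress counter shared between
+// a worker and a consumer:
+//
+//   vpx_atomic_int rows_done = VPX_ATOMIC_INIT(0);
+//   // worker, after finishing row r:
+//   vpx_atomic_store_release(&rows_done, r + 1);
+//   // consumer, before reading row r:
+//   while (vpx_atomic_load_acquire(&rows_done) <= r) { /* spin or yield */ }
+//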
+static INLINE void vpx_atomic_init(vpx_atomic_int *atomic, int value) { + atomic->value = value; +} + +static INLINE void vpx_atomic_store_release(vpx_atomic_int *atomic, int value) { +#if defined(VPX_USE_ATOMIC_BUILTINS) + __atomic_store_n(&atomic->value, value, __ATOMIC_RELEASE); +#else + vpx_atomic_memory_barrier(); + atomic->value = value; +#endif // defined(VPX_USE_ATOMIC_BUILTINS) +} + +static INLINE int vpx_atomic_load_acquire(const vpx_atomic_int *atomic) { +#if defined(VPX_USE_ATOMIC_BUILTINS) + return __atomic_load_n(&atomic->value, __ATOMIC_ACQUIRE); +#else + int v = atomic->value; + vpx_atomic_memory_barrier(); + return v; +#endif // defined(VPX_USE_ATOMIC_BUILTINS) +} + +#undef VPX_USE_ATOMIC_BUILTINS +#undef vpx_atomic_memory_barrier + +#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // VPX_UTIL_VPX_ATOMICS_H_ diff --git a/libvpx/vpx_util/vpx_util.mk b/libvpx/vpx_util/vpx_util.mk index c0ef8d336..86d3ece3c 100644 --- a/libvpx/vpx_util/vpx_util.mk +++ b/libvpx/vpx_util/vpx_util.mk @@ -8,7 +8,10 @@ ## be found in the AUTHORS file in the root of the source tree. ## +UTIL_SRCS-yes += vpx_atomics.h UTIL_SRCS-yes += vpx_util.mk UTIL_SRCS-yes += vpx_thread.c UTIL_SRCS-yes += vpx_thread.h UTIL_SRCS-yes += endian_inl.h +UTIL_SRCS-yes += vpx_write_yuv_frame.h +UTIL_SRCS-yes += vpx_write_yuv_frame.c diff --git a/libvpx/vpx_util/vpx_write_yuv_frame.c b/libvpx/vpx_util/vpx_write_yuv_frame.c new file mode 100644 index 000000000..ab6855811 --- /dev/null +++ b/libvpx/vpx_util/vpx_write_yuv_frame.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/skin_detection.h" +#include "vpx_util/vpx_write_yuv_frame.h" + +void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { +#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) || \ + defined(OUTPUT_YUV_SKINMAP) + + unsigned char *src = s->y_buffer; + int h = s->y_crop_height; + + do { + fwrite(src, s->y_width, 1, yuv_file); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); + + src = s->v_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); + +#else + (void)yuv_file; + (void)s; +#endif +} diff --git a/libvpx/vpx_util/vpx_write_yuv_frame.h b/libvpx/vpx_util/vpx_write_yuv_frame.h new file mode 100644 index 000000000..1cb702981 --- /dev/null +++ b/libvpx/vpx_util/vpx_write_yuv_frame.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ +#define VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ + +#include <stdio.h> +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ diff --git a/libvpx/vpxdec.c b/libvpx/vpxdec.c index 6db2afb4a..ff20e6a3c 100644 --- a/libvpx/vpxdec.c +++ b/libvpx/vpxdec.c @@ -47,6 +47,8 @@ struct VpxDecInputContext { struct WebmInputContext *webm_ctx; }; +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const arg_def_t looparg = ARG_DEF(NULL, "loops", 1, "Number of times to decode the file"); static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); @@ -98,17 +100,17 @@ static const arg_def_t framestatsarg = ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)"); static const arg_def_t *all_args[] = { - &codecarg, &use_yv12, &use_i420, - &flipuvarg, &rawvideo, &noblitarg, - &progressarg, &limitarg, &skiparg, - &postprocarg, &summaryarg, &outputfile, - &threadsarg, &frameparallelarg, &verbosearg, - &scalearg, &fb_arg, &md5arg, - &error_concealment, &continuearg, + &help, &codecarg, &use_yv12, + &use_i420, &flipuvarg, &rawvideo, + &noblitarg, &progressarg, &limitarg, + &skiparg, &postprocarg, &summaryarg, + &outputfile, &threadsarg, &frameparallelarg, + &verbosearg, &scalearg, &fb_arg, + &md5arg, &error_concealment, &continuearg, #if CONFIG_VP9_HIGHBITDEPTH &outbitdeptharg, #endif - &svcdecodingarg, &framestatsarg, NULL + &svcdecodingarg, &framestatsarg, NULL }; #if CONFIG_VP8_DECODER @@ -152,41 +154,47 @@ static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst, dst->d_h, mode); } #endif - -void usage_exit(void) { +void show_help(FILE *fout, int shorthelp) { int i; - fprintf(stderr, - "Usage: %s <options> filename\n\n" - "Options:\n", - exec_name); - arg_show_usage(stderr, all_args); + fprintf(fout, "Usage: %s <options> filename\n\n", exec_name); + + if (shorthelp) { + fprintf(fout, "Use --help to see the full list of options.\n"); + return; + } + + fprintf(fout, "Options:\n"); + arg_show_usage(fout, all_args); #if CONFIG_VP8_DECODER - fprintf(stderr, "\nVP8 Postprocessing Options:\n"); - arg_show_usage(stderr, vp8_pp_args); + fprintf(fout, "\nVP8 Postprocessing Options:\n"); + arg_show_usage(fout, vp8_pp_args); #endif - fprintf(stderr, + fprintf(fout, "\nOutput File Patterns:\n\n" " The -o argument specifies the name of the file(s) to " "write to. If the\n argument does not include any escape " "characters, the output will be\n written to a single file. " "Otherwise, the filename will be calculated by\n expanding " "the following escape characters:\n"); - fprintf(stderr, + fprintf(fout, "\n\t%%w - Frame width" "\n\t%%h - Frame height" "\n\t%%<n> - Frame number, zero padded to <n> places (1..9)" "\n\n Pattern arguments are only supported in conjunction " "with the --yv12 and\n --i420 options. 
If the -o option is " "not specified, the output will be\n directed to stdout.\n"); - fprintf(stderr, "\nIncluded decoders:\n\n"); + fprintf(fout, "\nIncluded decoders:\n\n"); for (i = 0; i < get_vpx_decoder_count(); ++i) { const VpxInterface *const decoder = get_vpx_decoder_by_index(i); - fprintf(stderr, " %-6s - %s\n", decoder->name, + fprintf(fout, " %-6s - %s\n", decoder->name, vpx_codec_iface_name(decoder->codec_interface())); } +} +void usage_exit(void) { + show_help(stderr, 1); exit(EXIT_FAILURE); } @@ -554,7 +562,10 @@ static int main_loop(int argc, const char **argv_) { memset(&arg, 0, sizeof(arg)); arg.argv_step = 1; - if (arg_match(&arg, &codecarg, argi)) { + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { interface = get_vpx_decoder_by_name(arg.val); if (!interface) die("Error: Unrecognized argument (%s) to --codec\n", arg.val); @@ -651,6 +662,7 @@ static int main_loop(int argc, const char **argv_) { if (!fn) { free(argv); + fprintf(stderr, "No input file specified!\n"); usage_exit(); } /* Open file */ diff --git a/libvpx/vpxenc.c b/libvpx/vpxenc.c index 6c887dfeb..4db7eccc3 100644 --- a/libvpx/vpxenc.c +++ b/libvpx/vpxenc.c @@ -123,6 +123,8 @@ static int fourcc_is_ivf(const char detect[4]) { return 0; } +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const arg_def_t debugmode = ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)"); static const arg_def_t outputfile = @@ -199,7 +201,8 @@ static const arg_def_t test16bitinternalarg = ARG_DEF( NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer"); #endif -static const arg_def_t *main_args[] = { &debugmode, +static const arg_def_t *main_args[] = { &help, + &debugmode, &outputfile, &codecarg, &passes, @@ -321,8 +324,11 @@ static const arg_def_t minsection_pct = ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"); static const arg_def_t maxsection_pct = ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"); -static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct, - &maxsection_pct, NULL }; +static const arg_def_t corpus_complexity = + ARG_DEF(NULL, "corpus-complexity", 1, "corpus vbr complexity midpoint"); +static const arg_def_t *rc_twopass_args[] = { + &bias_pct, &minsection_pct, &maxsection_pct, &corpus_complexity, NULL +}; static const arg_def_t kf_min_dist = ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"); @@ -441,8 +447,8 @@ static const struct arg_enum_list color_space_enum[] = { }; static const arg_def_t input_color_space = - ARG_DEF_ENUM(NULL, "color-space", 1, "The color space of input content:", - color_space_enum); + ARG_DEF_ENUM(NULL, "color-space", 1, + "The color space of input content:", color_space_enum); #if CONFIG_VP9_HIGHBITDEPTH static const struct arg_enum_list bitdepth_enum[] = { @@ -460,6 +466,7 @@ static const arg_def_t inbitdeptharg = static const struct arg_enum_list tune_content_enum[] = { { "default", VP9E_CONTENT_DEFAULT }, { "screen", VP9E_CONTENT_SCREEN }, + { "film", VP9E_CONTENT_FILM }, { NULL, 0 } }; @@ -468,8 +475,14 @@ static const arg_def_t tune_content = ARG_DEF_ENUM( static const arg_def_t target_level = ARG_DEF( NULL, "target-level", 1, - "Target level (255: off (default); 0: only keep level stats; 10: level 1.0;" - " 11: level 1.1; ... 
62: level 6.2)"); + "Target level\n" + " 255: off (default)\n" + " 0: only keep level stats\n" + " 1: adaptively set alt-ref " + "distance and column tile limit based on picture size, and keep" + " level stats\n" + " 10: level 1.0 11: level 1.1 " + "... 62: level 6.2"); static const arg_def_t row_mt = ARG_DEF(NULL, "row-mt", 1, @@ -539,46 +552,54 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED, static const arg_def_t *no_args[] = { NULL }; -void usage_exit(void) { +void show_help(FILE *fout, int shorthelp) { int i; const int num_encoder = get_vpx_encoder_count(); - fprintf(stderr, "Usage: %s <options> -o dst_filename src_filename \n", + fprintf(fout, "Usage: %s <options> -o dst_filename src_filename \n", exec_name); - fprintf(stderr, "\nOptions:\n"); - arg_show_usage(stderr, main_args); - fprintf(stderr, "\nEncoder Global Options:\n"); - arg_show_usage(stderr, global_args); - fprintf(stderr, "\nRate Control Options:\n"); - arg_show_usage(stderr, rc_args); - fprintf(stderr, "\nTwopass Rate Control Options:\n"); - arg_show_usage(stderr, rc_twopass_args); - fprintf(stderr, "\nKeyframe Placement Options:\n"); - arg_show_usage(stderr, kf_args); + if (shorthelp) { + fprintf(fout, "Use --help to see the full list of options.\n"); + return; + } + + fprintf(fout, "\nOptions:\n"); + arg_show_usage(fout, main_args); + fprintf(fout, "\nEncoder Global Options:\n"); + arg_show_usage(fout, global_args); + fprintf(fout, "\nRate Control Options:\n"); + arg_show_usage(fout, rc_args); + fprintf(fout, "\nTwopass Rate Control Options:\n"); + arg_show_usage(fout, rc_twopass_args); + fprintf(fout, "\nKeyframe Placement Options:\n"); + arg_show_usage(fout, kf_args); #if CONFIG_VP8_ENCODER - fprintf(stderr, "\nVP8 Specific Options:\n"); - arg_show_usage(stderr, vp8_args); + fprintf(fout, "\nVP8 Specific Options:\n"); + arg_show_usage(fout, vp8_args); #endif #if CONFIG_VP9_ENCODER - fprintf(stderr, "\nVP9 Specific Options:\n"); - arg_show_usage(stderr, vp9_args); + fprintf(fout, "\nVP9 Specific Options:\n"); + arg_show_usage(fout, vp9_args); #endif - fprintf(stderr, + fprintf(fout, "\nStream timebase (--timebase):\n" " The desired precision of timestamps in the output, expressed\n" " in fractional seconds. Default is 1/1000.\n"); - fprintf(stderr, "\nIncluded encoders:\n\n"); + fprintf(fout, "\nIncluded encoders:\n\n"); for (i = 0; i < num_encoder; ++i) { const VpxInterface *const encoder = get_vpx_encoder_by_index(i); const char *defstr = (i == (num_encoder - 1)) ? 
"(default)" : ""; - fprintf(stderr, " %-6s - %s %s\n", encoder->name, + fprintf(fout, " %-6s - %s %s\n", encoder->name, vpx_codec_iface_name(encoder->codec_interface()), defstr); } - fprintf(stderr, "\n "); - fprintf(stderr, "Use --codec to switch to a non-default encoder.\n\n"); + fprintf(fout, "\n "); + fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n"); +} +void usage_exit(void) { + show_help(stderr, 1); exit(EXIT_FAILURE); } @@ -893,7 +914,10 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { arg.argv_step = 1; - if (arg_match(&arg, &codecarg, argi)) { + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { global->codec = get_vpx_encoder_by_name(arg.val); if (!global->codec) die("Error: Unrecognized argument (%s) to --codec\n", arg.val); @@ -1229,6 +1253,11 @@ static int parse_stream_params(struct VpxEncoderConfig *global, if (global->passes < 2) warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &corpus_complexity, argi)) { + config->cfg.rc_2pass_vbr_corpus_complexity = arg_parse_uint(&arg); + + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); } else if (arg_match(&arg, &kf_min_dist, argi)) { config->cfg.kf_min_dist = arg_parse_uint(&arg); } else if (arg_match(&arg, &kf_max_dist, argi)) { @@ -1425,6 +1454,7 @@ static void show_stream_config(struct stream_state *stream, SHOW(rc_2pass_vbr_bias_pct); SHOW(rc_2pass_vbr_minsection_pct); SHOW(rc_2pass_vbr_maxsection_pct); + SHOW(rc_2pass_vbr_corpus_complexity); SHOW(kf_mode); SHOW(kf_min_dist); SHOW(kf_max_dist); @@ -1889,8 +1919,6 @@ int main(int argc, const char **argv_) { memset(&input, 0, sizeof(input)); exec_name = argv_[0]; - if (argc < 3) usage_exit(); - /* Setup default input stream settings */ input.framerate.numerator = 30; input.framerate.denominator = 1; @@ -1904,6 +1932,8 @@ int main(int argc, const char **argv_) { argv = argv_dup(argc - 1, argv_ + 1); parse_global_config(&global, argv); + if (argc < 3) usage_exit(); + switch (global.color_type) { case I420: input.fmt = VPX_IMG_FMT_I420; break; case I422: input.fmt = VPX_IMG_FMT_I422; break; @@ -1937,7 +1967,10 @@ int main(int argc, const char **argv_) { /* Handle non-option arguments */ input.filename = argv[0]; - if (!input.filename) usage_exit(); + if (!input.filename) { + fprintf(stderr, "No input file specified!\n"); + usage_exit(); + } /* Decide if other chroma subsamplings than 4:2:0 are supported */ if (global.codec->fourcc == VP9_FOURCC) input.only_i420 = 0; diff --git a/libvpx/y4minput.c b/libvpx/y4minput.c index acf7d69fe..1de636cc0 100644 --- a/libvpx/y4minput.c +++ b/libvpx/y4minput.c @@ -195,26 +195,29 @@ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst, window.*/ for (x = 0; x < OC_MINI(_c_w, 2); x++) { _dst[x] = (unsigned char)OC_CLAMPI( - 0, (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] + - 35 * _src[OC_MINI(x + 1, _c_w - 1)] - - 9 * _src[OC_MINI(x + 2, _c_w - 1)] + - _src[OC_MINI(x + 3, _c_w - 1)] + 64) >> - 7, + 0, + (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[OC_MINI(x + 3, _c_w - 1)] + + 64) >> + 7, 255); } for (; x < _c_w - 3; x++) { _dst[x] = (unsigned char)OC_CLAMPI( - 0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + - 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x 
+ 3] + 64) >> - 7, + 0, + (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >> + 7, 255); } for (; x < _c_w; x++) { _dst[x] = (unsigned char)OC_CLAMPI( - 0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + - 35 * _src[OC_MINI(x + 1, _c_w - 1)] - - 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >> - 7, + 0, + (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >> + 7, 255); } _dst += _c_w; @@ -314,28 +317,31 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, for (x = 0; x < c_w; x++) { for (y = 0; y < OC_MINI(c_h, 3); y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( - 0, (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + - 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - - 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + - 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> - 7, + 0, + (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + + 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> + 7, 255); } for (; y < c_h - 2; y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( - 0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + - 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - - 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> - 7, + 0, + (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> + 7, 255); } for (; y < c_h; y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( - 0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + - 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - - 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + - 4 * tmp[(c_h - 1) * c_w] + 64) >> - 7, + 0, + (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[(c_h - 1) * c_w] + 64) >> + 7, 255); } _dst++; @@ -361,10 +367,11 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, } for (; y < c_h - 3; y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( - 0, (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + - 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - - 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> - 7, + 0, + (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - + 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> + 7, 255); } for (; y < c_h; y++) { @@ -404,18 +411,20 @@ static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst, for (x = 0; x < _c_w; x++) { for (y = 0; y < OC_MINI(_c_h, 2); y += 2) { _dst[(y >> 1) * _c_w] = - OC_CLAMPI(0, (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - - 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] + - 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> - 7, + OC_CLAMPI(0, + (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - + 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] + + 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> + 7, 255); } for (; y < _c_h - 3; y += 2) { _dst[(y >> 1) * _c_w] = - OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - - 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) + - 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >> - 7, + OC_CLAMPI(0, + (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) + + 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) 
>> + 7, 255); } for (; y < _c_h; y += 2) { @@ -642,33 +651,38 @@ static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst, 4-tap Mitchell window.*/ for (x = 0; x < OC_MINI(c_w, 1); x++) { tmp[x << 1] = (unsigned char)OC_CLAMPI( - 0, (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] - - _aux[OC_MINI(2, c_w - 1)] + 64) >> - 7, + 0, + (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] - + _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, 255); tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( - 0, (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] - - 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> - 7, + 0, + (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] - + 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, 255); } for (; x < c_w - 2; x++) { tmp[x << 1] = - (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x] + - 18 * _aux[x + 1] - _aux[x + 2] + 64) >> - 7, + (unsigned char)OC_CLAMPI(0, + (_aux[x - 1] + 110 * _aux[x] + + 18 * _aux[x + 1] - _aux[x + 2] + 64) >> + 7, 255); tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( - 0, (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] - - 5 * _aux[x + 2] + 64) >> - 7, + 0, + (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] - + 5 * _aux[x + 2] + 64) >> + 7, 255); } for (; x < c_w; x++) { tmp[x << 1] = (unsigned char)OC_CLAMPI( - 0, (_aux[x - 1] + 110 * _aux[x] + - 18 * _aux[OC_MINI(x + 1, c_w - 1)] - _aux[c_w - 1] + 64) >> - 7, + 0, + (_aux[x - 1] + 110 * _aux[x] + 18 * _aux[OC_MINI(x + 1, c_w - 1)] - + _aux[c_w - 1] + 64) >> + 7, 255); if ((x << 1 | 1) < dst_c_w) { tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( @@ -718,27 +732,29 @@ static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst, /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ for (y = 0; y < c_h; y++) { for (x = 0; x < OC_MINI(c_w, 2); x += 2) { - tmp[x >> 1] = - OC_CLAMPI(0, (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - - 17 * _aux[OC_MINI(2, c_w - 1)] + - 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> - 7, - 255); + tmp[x >> 1] = OC_CLAMPI(0, + (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - + 17 * _aux[OC_MINI(2, c_w - 1)] + + 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> + 7, + 255); } for (; x < c_w - 3; x += 2) { - tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[x + 3]) - - 17 * (_aux[x - 1] + _aux[x + 2]) + - 78 * (_aux[x] + _aux[x + 1]) + 64) >> - 7, + tmp[x >> 1] = OC_CLAMPI(0, + (3 * (_aux[x - 2] + _aux[x + 3]) - + 17 * (_aux[x - 1] + _aux[x + 2]) + + 78 * (_aux[x] + _aux[x + 1]) + 64) >> + 7, 255); } for (; x < c_w; x += 2) { - tmp[x >> 1] = OC_CLAMPI( - 0, (3 * (_aux[x - 2] + _aux[c_w - 1]) - - 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + - 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> - 7, - 255); + tmp[x >> 1] = + OC_CLAMPI(0, + (3 * (_aux[x - 2] + _aux[c_w - 1]) - + 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + + 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> + 7, + 255); } tmp += dst_c_w; _aux += c_w; diff --git a/libwebm/Android.bp b/libwebm/Android.bp index a1d335ebc..753746e84 100644 --- a/libwebm/Android.bp +++ b/libwebm/Android.bp @@ -1,6 +1,7 @@ cc_library_static { name: "libwebm", srcs: ["mkvparser/mkvparser.cc"], + cflags: ["-Wall", "-Werror"], export_include_dirs: ["."], sanitize: { cfi: true, |
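[Editorial note: not part of the diff] The y4minput.c hunks above are purely mechanical re-indentation; the operands of each OC_CLAMPI expression move between lines, but the arithmetic is unchanged. For reference, each converter applies a clamped fixed-point kernel whose taps sum to 128, such as the [3 -17 78 78 -17 3]/128 filter derived from a 6-tap Lanczos window; a sketch of one output sample, with a hypothetical helper name:

    #include <stdint.h>

    /* Hypothetical helper: one sample of the 6-tap filter. Adding 64 rounds
     * before the >> 7 divides by 128; the result is clamped to [0, 255]. */
    static uint8_t lanczos6(const uint8_t s[6]) {
      const int v = (3 * (s[0] + s[5]) - 17 * (s[1] + s[4]) +
                     78 * (s[2] + s[3]) + 64) >> 7;
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }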